{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv(\"/data/credit-default.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>checking_balance</th>\n",
       "      <th>months_loan_duration</th>\n",
       "      <th>credit_history</th>\n",
       "      <th>purpose</th>\n",
       "      <th>amount</th>\n",
       "      <th>savings_balance</th>\n",
       "      <th>employment_length</th>\n",
       "      <th>installment_rate</th>\n",
       "      <th>personal_status</th>\n",
       "      <th>other_debtors</th>\n",
       "      <th>...</th>\n",
       "      <th>property</th>\n",
       "      <th>age</th>\n",
       "      <th>installment_plan</th>\n",
       "      <th>housing</th>\n",
       "      <th>existing_credits</th>\n",
       "      <th>default</th>\n",
       "      <th>dependents</th>\n",
       "      <th>telephone</th>\n",
       "      <th>foreign_worker</th>\n",
       "      <th>job</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>&lt; 0 DM</td>\n",
       "      <td>6</td>\n",
       "      <td>critical</td>\n",
       "      <td>radio/tv</td>\n",
       "      <td>1169</td>\n",
       "      <td>unknown</td>\n",
       "      <td>&gt; 7 yrs</td>\n",
       "      <td>4</td>\n",
       "      <td>single male</td>\n",
       "      <td>none</td>\n",
       "      <td>...</td>\n",
       "      <td>real estate</td>\n",
       "      <td>67</td>\n",
       "      <td>none</td>\n",
       "      <td>own</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>yes</td>\n",
       "      <td>yes</td>\n",
       "      <td>skilled employee</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1 - 200 DM</td>\n",
       "      <td>48</td>\n",
       "      <td>repaid</td>\n",
       "      <td>radio/tv</td>\n",
       "      <td>5951</td>\n",
       "      <td>&lt; 100 DM</td>\n",
       "      <td>1 - 4 yrs</td>\n",
       "      <td>2</td>\n",
       "      <td>female</td>\n",
       "      <td>none</td>\n",
       "      <td>...</td>\n",
       "      <td>real estate</td>\n",
       "      <td>22</td>\n",
       "      <td>none</td>\n",
       "      <td>own</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>none</td>\n",
       "      <td>yes</td>\n",
       "      <td>skilled employee</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>unknown</td>\n",
       "      <td>12</td>\n",
       "      <td>critical</td>\n",
       "      <td>education</td>\n",
       "      <td>2096</td>\n",
       "      <td>&lt; 100 DM</td>\n",
       "      <td>4 - 7 yrs</td>\n",
       "      <td>2</td>\n",
       "      <td>single male</td>\n",
       "      <td>none</td>\n",
       "      <td>...</td>\n",
       "      <td>real estate</td>\n",
       "      <td>49</td>\n",
       "      <td>none</td>\n",
       "      <td>own</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>none</td>\n",
       "      <td>yes</td>\n",
       "      <td>unskilled resident</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>&lt; 0 DM</td>\n",
       "      <td>42</td>\n",
       "      <td>repaid</td>\n",
       "      <td>furniture</td>\n",
       "      <td>7882</td>\n",
       "      <td>&lt; 100 DM</td>\n",
       "      <td>4 - 7 yrs</td>\n",
       "      <td>2</td>\n",
       "      <td>single male</td>\n",
       "      <td>guarantor</td>\n",
       "      <td>...</td>\n",
       "      <td>building society savings</td>\n",
       "      <td>45</td>\n",
       "      <td>none</td>\n",
       "      <td>for free</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>none</td>\n",
       "      <td>yes</td>\n",
       "      <td>skilled employee</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>&lt; 0 DM</td>\n",
       "      <td>24</td>\n",
       "      <td>delayed</td>\n",
       "      <td>car (new)</td>\n",
       "      <td>4870</td>\n",
       "      <td>&lt; 100 DM</td>\n",
       "      <td>1 - 4 yrs</td>\n",
       "      <td>3</td>\n",
       "      <td>single male</td>\n",
       "      <td>none</td>\n",
       "      <td>...</td>\n",
       "      <td>unknown/none</td>\n",
       "      <td>53</td>\n",
       "      <td>none</td>\n",
       "      <td>for free</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>none</td>\n",
       "      <td>yes</td>\n",
       "      <td>skilled employee</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>995</th>\n",
       "      <td>unknown</td>\n",
       "      <td>12</td>\n",
       "      <td>repaid</td>\n",
       "      <td>furniture</td>\n",
       "      <td>1736</td>\n",
       "      <td>&lt; 100 DM</td>\n",
       "      <td>4 - 7 yrs</td>\n",
       "      <td>3</td>\n",
       "      <td>female</td>\n",
       "      <td>none</td>\n",
       "      <td>...</td>\n",
       "      <td>real estate</td>\n",
       "      <td>31</td>\n",
       "      <td>none</td>\n",
       "      <td>own</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>none</td>\n",
       "      <td>yes</td>\n",
       "      <td>unskilled resident</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>996</th>\n",
       "      <td>&lt; 0 DM</td>\n",
       "      <td>30</td>\n",
       "      <td>repaid</td>\n",
       "      <td>car (used)</td>\n",
       "      <td>3857</td>\n",
       "      <td>&lt; 100 DM</td>\n",
       "      <td>1 - 4 yrs</td>\n",
       "      <td>4</td>\n",
       "      <td>divorced male</td>\n",
       "      <td>none</td>\n",
       "      <td>...</td>\n",
       "      <td>building society savings</td>\n",
       "      <td>40</td>\n",
       "      <td>none</td>\n",
       "      <td>own</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>yes</td>\n",
       "      <td>yes</td>\n",
       "      <td>mangement self-employed</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>997</th>\n",
       "      <td>unknown</td>\n",
       "      <td>12</td>\n",
       "      <td>repaid</td>\n",
       "      <td>radio/tv</td>\n",
       "      <td>804</td>\n",
       "      <td>&lt; 100 DM</td>\n",
       "      <td>&gt; 7 yrs</td>\n",
       "      <td>4</td>\n",
       "      <td>single male</td>\n",
       "      <td>none</td>\n",
       "      <td>...</td>\n",
       "      <td>other</td>\n",
       "      <td>38</td>\n",
       "      <td>none</td>\n",
       "      <td>own</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>none</td>\n",
       "      <td>yes</td>\n",
       "      <td>skilled employee</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>998</th>\n",
       "      <td>&lt; 0 DM</td>\n",
       "      <td>45</td>\n",
       "      <td>repaid</td>\n",
       "      <td>radio/tv</td>\n",
       "      <td>1845</td>\n",
       "      <td>&lt; 100 DM</td>\n",
       "      <td>1 - 4 yrs</td>\n",
       "      <td>4</td>\n",
       "      <td>single male</td>\n",
       "      <td>none</td>\n",
       "      <td>...</td>\n",
       "      <td>unknown/none</td>\n",
       "      <td>23</td>\n",
       "      <td>none</td>\n",
       "      <td>for free</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>yes</td>\n",
       "      <td>yes</td>\n",
       "      <td>skilled employee</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>999</th>\n",
       "      <td>1 - 200 DM</td>\n",
       "      <td>45</td>\n",
       "      <td>critical</td>\n",
       "      <td>car (used)</td>\n",
       "      <td>4576</td>\n",
       "      <td>101 - 500 DM</td>\n",
       "      <td>unemployed</td>\n",
       "      <td>3</td>\n",
       "      <td>single male</td>\n",
       "      <td>none</td>\n",
       "      <td>...</td>\n",
       "      <td>other</td>\n",
       "      <td>27</td>\n",
       "      <td>none</td>\n",
       "      <td>own</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>none</td>\n",
       "      <td>yes</td>\n",
       "      <td>skilled employee</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1000 rows × 21 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "    checking_balance  months_loan_duration credit_history     purpose  amount  \\\n",
       "0             < 0 DM                     6       critical    radio/tv    1169   \n",
       "1         1 - 200 DM                    48         repaid    radio/tv    5951   \n",
       "2            unknown                    12       critical   education    2096   \n",
       "3             < 0 DM                    42         repaid   furniture    7882   \n",
       "4             < 0 DM                    24        delayed   car (new)    4870   \n",
       "..               ...                   ...            ...         ...     ...   \n",
       "995          unknown                    12         repaid   furniture    1736   \n",
       "996           < 0 DM                    30         repaid  car (used)    3857   \n",
       "997          unknown                    12         repaid    radio/tv     804   \n",
       "998           < 0 DM                    45         repaid    radio/tv    1845   \n",
       "999       1 - 200 DM                    45       critical  car (used)    4576   \n",
       "\n",
       "    savings_balance employment_length  installment_rate personal_status  \\\n",
       "0           unknown           > 7 yrs                 4     single male   \n",
       "1          < 100 DM         1 - 4 yrs                 2          female   \n",
       "2          < 100 DM         4 - 7 yrs                 2     single male   \n",
       "3          < 100 DM         4 - 7 yrs                 2     single male   \n",
       "4          < 100 DM         1 - 4 yrs                 3     single male   \n",
       "..              ...               ...               ...             ...   \n",
       "995        < 100 DM         4 - 7 yrs                 3          female   \n",
       "996        < 100 DM         1 - 4 yrs                 4   divorced male   \n",
       "997        < 100 DM           > 7 yrs                 4     single male   \n",
       "998        < 100 DM         1 - 4 yrs                 4     single male   \n",
       "999    101 - 500 DM        unemployed                 3     single male   \n",
       "\n",
       "    other_debtors  ...                  property age  installment_plan  \\\n",
       "0            none  ...               real estate  67              none   \n",
       "1            none  ...               real estate  22              none   \n",
       "2            none  ...               real estate  49              none   \n",
       "3       guarantor  ...  building society savings  45              none   \n",
       "4            none  ...              unknown/none  53              none   \n",
       "..            ...  ...                       ...  ..               ...   \n",
       "995          none  ...               real estate  31              none   \n",
       "996          none  ...  building society savings  40              none   \n",
       "997          none  ...                     other  38              none   \n",
       "998          none  ...              unknown/none  23              none   \n",
       "999          none  ...                     other  27              none   \n",
       "\n",
       "      housing existing_credits  default  dependents  telephone foreign_worker  \\\n",
       "0         own                2        1           1        yes            yes   \n",
       "1         own                1        2           1       none            yes   \n",
       "2         own                1        1           2       none            yes   \n",
       "3    for free                1        1           2       none            yes   \n",
       "4    for free                2        2           2       none            yes   \n",
       "..        ...              ...      ...         ...        ...            ...   \n",
       "995       own                1        1           1       none            yes   \n",
       "996       own                1        1           1        yes            yes   \n",
       "997       own                1        1           1       none            yes   \n",
       "998  for free                1        2           1        yes            yes   \n",
       "999       own                1        1           1       none            yes   \n",
       "\n",
       "                         job  \n",
       "0           skilled employee  \n",
       "1           skilled employee  \n",
       "2         unskilled resident  \n",
       "3           skilled employee  \n",
       "4           skilled employee  \n",
       "..                       ...  \n",
       "995       unskilled resident  \n",
       "996  mangement self-employed  \n",
       "997         skilled employee  \n",
       "998         skilled employee  \n",
       "999         skilled employee  \n",
       "\n",
       "[1000 rows x 21 columns]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 1000 entries, 0 to 999\n",
      "Data columns (total 21 columns):\n",
      " #   Column                Non-Null Count  Dtype \n",
      "---  ------                --------------  ----- \n",
      " 0   checking_balance      1000 non-null   object\n",
      " 1   months_loan_duration  1000 non-null   int64 \n",
      " 2   credit_history        1000 non-null   object\n",
      " 3   purpose               1000 non-null   object\n",
      " 4   amount                1000 non-null   int64 \n",
      " 5   savings_balance       1000 non-null   object\n",
      " 6   employment_length     1000 non-null   object\n",
      " 7   installment_rate      1000 non-null   int64 \n",
      " 8   personal_status       1000 non-null   object\n",
      " 9   other_debtors         1000 non-null   object\n",
      " 10  residence_history     1000 non-null   int64 \n",
      " 11  property              1000 non-null   object\n",
      " 12  age                   1000 non-null   int64 \n",
      " 13  installment_plan      1000 non-null   object\n",
      " 14  housing               1000 non-null   object\n",
      " 15  existing_credits      1000 non-null   int64 \n",
      " 16  default               1000 non-null   int64 \n",
      " 17  dependents            1000 non-null   int64 \n",
      " 18  telephone             1000 non-null   object\n",
      " 19  foreign_worker        1000 non-null   object\n",
      " 20  job                   1000 non-null   object\n",
      "dtypes: int64(8), object(13)\n",
      "memory usage: 164.2+ KB\n"
     ]
    }
   ],
   "source": [
    "df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1    700\n",
       "2    300\n",
       "Name: default, dtype: int64"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.default.value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1    0.7\n",
       "2    0.3\n",
       "Name: default, dtype: float64"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.default.value_counts()/len(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 1 -> Negative sample (0) -> customer did not default\n",
    "# 2 -> Positive sample (1) -> customer defaulted (was not able to pay back the loan)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# P -> 76\n",
    "# N -> 45"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.628099173553719"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "p1 = 76 / (76 + 45)\n",
    "p1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# p0 + p1 = 1\n",
    "#\n",
    "# gini = p0*(1-p0) + p1*(1-p1) = (1-p1)*p1 + p1*(1-p1) = 2 * p1 * (1-p1) = 2 * p1 * p0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.4671812034697083"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "2 * p1 * (1-p1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# P -> 11\n",
    "# N -> 32"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.7441860465116279"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "p1 = 32/(32 + 11)\n",
    "p1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.3807463493780422"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "2 * p1 * (1-p1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# P -> 9\n",
    "# N -> 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.9"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "9/(9+1) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# P -> 4\n",
    "# N -> 6"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.6"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "6/(4+6)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "p1 = 214/700"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.4245061224489796"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "2 * p1 * (1-p1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.3545"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "(.488 + .221)/2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.0695"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    ".424 - .3545"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.3827257142857143"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "(424 * .488 + 276 * .221)/ 700 # weighted gini avg"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.04178040816326534"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    " 0.4245061224489796 - 0.3827257142857143 #information gain"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "# - sum(pi * log(pi)) -> Entropy\n",
    "# sum(pi * (1-pi))  -> Gini = sum(pi) - sum(pi * pi) = 1 - sum(pi * pi) = sum(pi * pj) where i != j"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# objective of the tree is to find the condition that maximizes the information gain"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# stump -> a condition only on one feature, not more than one."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "# purpose of loan - P1, P2, P3 (categorical)\n",
    "# age - 20, 60 (cont)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "# candidates for a stump\n",
    "# for each candidate find the information gain\n",
    "# select the condition which gives the max information gain\n",
    "#\n",
    "# purpose == P1\n",
    "# purpose == P2\n",
    "# purpose == P3\n",
    "#\n",
    "# age < 25 (percentile 10)\n",
    "# age < 30 (percentile 20)\n",
    "# age < 37 (percentile 30)\n",
    "# ..."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1000, 21)"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>checking_balance</th>\n",
       "      <th>months_loan_duration</th>\n",
       "      <th>credit_history</th>\n",
       "      <th>purpose</th>\n",
       "      <th>amount</th>\n",
       "      <th>savings_balance</th>\n",
       "      <th>employment_length</th>\n",
       "      <th>installment_rate</th>\n",
       "      <th>personal_status</th>\n",
       "      <th>other_debtors</th>\n",
       "      <th>...</th>\n",
       "      <th>property</th>\n",
       "      <th>age</th>\n",
       "      <th>installment_plan</th>\n",
       "      <th>housing</th>\n",
       "      <th>existing_credits</th>\n",
       "      <th>default</th>\n",
       "      <th>dependents</th>\n",
       "      <th>telephone</th>\n",
       "      <th>foreign_worker</th>\n",
       "      <th>job</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>&lt; 0 DM</td>\n",
       "      <td>6</td>\n",
       "      <td>critical</td>\n",
       "      <td>radio/tv</td>\n",
       "      <td>1169</td>\n",
       "      <td>unknown</td>\n",
       "      <td>&gt; 7 yrs</td>\n",
       "      <td>4</td>\n",
       "      <td>single male</td>\n",
       "      <td>none</td>\n",
       "      <td>...</td>\n",
       "      <td>real estate</td>\n",
       "      <td>67</td>\n",
       "      <td>none</td>\n",
       "      <td>own</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>yes</td>\n",
       "      <td>yes</td>\n",
       "      <td>skilled employee</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1 - 200 DM</td>\n",
       "      <td>48</td>\n",
       "      <td>repaid</td>\n",
       "      <td>radio/tv</td>\n",
       "      <td>5951</td>\n",
       "      <td>&lt; 100 DM</td>\n",
       "      <td>1 - 4 yrs</td>\n",
       "      <td>2</td>\n",
       "      <td>female</td>\n",
       "      <td>none</td>\n",
       "      <td>...</td>\n",
       "      <td>real estate</td>\n",
       "      <td>22</td>\n",
       "      <td>none</td>\n",
       "      <td>own</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>none</td>\n",
       "      <td>yes</td>\n",
       "      <td>skilled employee</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>unknown</td>\n",
       "      <td>12</td>\n",
       "      <td>critical</td>\n",
       "      <td>education</td>\n",
       "      <td>2096</td>\n",
       "      <td>&lt; 100 DM</td>\n",
       "      <td>4 - 7 yrs</td>\n",
       "      <td>2</td>\n",
       "      <td>single male</td>\n",
       "      <td>none</td>\n",
       "      <td>...</td>\n",
       "      <td>real estate</td>\n",
       "      <td>49</td>\n",
       "      <td>none</td>\n",
       "      <td>own</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>none</td>\n",
       "      <td>yes</td>\n",
       "      <td>unskilled resident</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>&lt; 0 DM</td>\n",
       "      <td>42</td>\n",
       "      <td>repaid</td>\n",
       "      <td>furniture</td>\n",
       "      <td>7882</td>\n",
       "      <td>&lt; 100 DM</td>\n",
       "      <td>4 - 7 yrs</td>\n",
       "      <td>2</td>\n",
       "      <td>single male</td>\n",
       "      <td>guarantor</td>\n",
       "      <td>...</td>\n",
       "      <td>building society savings</td>\n",
       "      <td>45</td>\n",
       "      <td>none</td>\n",
       "      <td>for free</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>none</td>\n",
       "      <td>yes</td>\n",
       "      <td>skilled employee</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>&lt; 0 DM</td>\n",
       "      <td>24</td>\n",
       "      <td>delayed</td>\n",
       "      <td>car (new)</td>\n",
       "      <td>4870</td>\n",
       "      <td>&lt; 100 DM</td>\n",
       "      <td>1 - 4 yrs</td>\n",
       "      <td>3</td>\n",
       "      <td>single male</td>\n",
       "      <td>none</td>\n",
       "      <td>...</td>\n",
       "      <td>unknown/none</td>\n",
       "      <td>53</td>\n",
       "      <td>none</td>\n",
       "      <td>for free</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>none</td>\n",
       "      <td>yes</td>\n",
       "      <td>skilled employee</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 21 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "  checking_balance  months_loan_duration credit_history    purpose  amount  \\\n",
       "0           < 0 DM                     6       critical   radio/tv    1169   \n",
       "1       1 - 200 DM                    48         repaid   radio/tv    5951   \n",
       "2          unknown                    12       critical  education    2096   \n",
       "3           < 0 DM                    42         repaid  furniture    7882   \n",
       "4           < 0 DM                    24        delayed  car (new)    4870   \n",
       "\n",
       "  savings_balance employment_length  installment_rate personal_status  \\\n",
       "0         unknown           > 7 yrs                 4     single male   \n",
       "1        < 100 DM         1 - 4 yrs                 2          female   \n",
       "2        < 100 DM         4 - 7 yrs                 2     single male   \n",
       "3        < 100 DM         4 - 7 yrs                 2     single male   \n",
       "4        < 100 DM         1 - 4 yrs                 3     single male   \n",
       "\n",
       "  other_debtors  ...                  property age  installment_plan  \\\n",
       "0          none  ...               real estate  67              none   \n",
       "1          none  ...               real estate  22              none   \n",
       "2          none  ...               real estate  49              none   \n",
       "3     guarantor  ...  building society savings  45              none   \n",
       "4          none  ...              unknown/none  53              none   \n",
       "\n",
       "    housing existing_credits  default  dependents  telephone foreign_worker  \\\n",
       "0       own                2        1           1        yes            yes   \n",
       "1       own                1        2           1       none            yes   \n",
       "2       own                1        1           2       none            yes   \n",
       "3  for free                1        1           2       none            yes   \n",
       "4  for free                2        2           2       none            yes   \n",
       "\n",
       "                  job  \n",
       "0    skilled employee  \n",
       "1    skilled employee  \n",
       "2  unskilled resident  \n",
       "3    skilled employee  \n",
       "4    skilled employee  \n",
       "\n",
       "[5 rows x 21 columns]"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "g1 = df[df.purpose == \"radio/tv\"]\n",
    "g2 =  df[df.purpose != \"radio/tv\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(280, 720)"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(g1), len(g2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1    0.778571\n",
       "2    0.221429\n",
       "Name: default, dtype: float64"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "g1.default.value_counts()/len(g1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.344796395918"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "2 * 0.778571 * 0.221429"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1    0.669444\n",
       "2    0.330556\n",
       "Name: default, dtype: float64"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "g2.default.value_counts()/len(g2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.44257746172800005"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "2 * .669444 * 0.330556 "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1    0.7\n",
       "2    0.3\n",
       "Name: default, dtype: float64"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.default.value_counts()/len(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.42"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "gini_root = 2 * .7 * .3\n",
    "gini_root"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.004801236698799893"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "0.42 - (280 * 0.344796395918 + 720 * 0.44257746172800005)/ (280+720) \n",
    "# information gain using purpose = \"Radio/tv\" as condition"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>checking_balance</th>\n",
       "      <th>months_loan_duration</th>\n",
       "      <th>credit_history</th>\n",
       "      <th>purpose</th>\n",
       "      <th>amount</th>\n",
       "      <th>savings_balance</th>\n",
       "      <th>employment_length</th>\n",
       "      <th>installment_rate</th>\n",
       "      <th>personal_status</th>\n",
       "      <th>other_debtors</th>\n",
       "      <th>...</th>\n",
       "      <th>property</th>\n",
       "      <th>age</th>\n",
       "      <th>installment_plan</th>\n",
       "      <th>housing</th>\n",
       "      <th>existing_credits</th>\n",
       "      <th>default</th>\n",
       "      <th>dependents</th>\n",
       "      <th>telephone</th>\n",
       "      <th>foreign_worker</th>\n",
       "      <th>job</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>&lt; 0 DM</td>\n",
       "      <td>6</td>\n",
       "      <td>critical</td>\n",
       "      <td>radio/tv</td>\n",
       "      <td>1169</td>\n",
       "      <td>unknown</td>\n",
       "      <td>&gt; 7 yrs</td>\n",
       "      <td>4</td>\n",
       "      <td>single male</td>\n",
       "      <td>none</td>\n",
       "      <td>...</td>\n",
       "      <td>real estate</td>\n",
       "      <td>67</td>\n",
       "      <td>none</td>\n",
       "      <td>own</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>yes</td>\n",
       "      <td>yes</td>\n",
       "      <td>skilled employee</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1 - 200 DM</td>\n",
       "      <td>48</td>\n",
       "      <td>repaid</td>\n",
       "      <td>radio/tv</td>\n",
       "      <td>5951</td>\n",
       "      <td>&lt; 100 DM</td>\n",
       "      <td>1 - 4 yrs</td>\n",
       "      <td>2</td>\n",
       "      <td>female</td>\n",
       "      <td>none</td>\n",
       "      <td>...</td>\n",
       "      <td>real estate</td>\n",
       "      <td>22</td>\n",
       "      <td>none</td>\n",
       "      <td>own</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>none</td>\n",
       "      <td>yes</td>\n",
       "      <td>skilled employee</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>unknown</td>\n",
       "      <td>12</td>\n",
       "      <td>critical</td>\n",
       "      <td>education</td>\n",
       "      <td>2096</td>\n",
       "      <td>&lt; 100 DM</td>\n",
       "      <td>4 - 7 yrs</td>\n",
       "      <td>2</td>\n",
       "      <td>single male</td>\n",
       "      <td>none</td>\n",
       "      <td>...</td>\n",
       "      <td>real estate</td>\n",
       "      <td>49</td>\n",
       "      <td>none</td>\n",
       "      <td>own</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>none</td>\n",
       "      <td>yes</td>\n",
       "      <td>unskilled resident</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>&lt; 0 DM</td>\n",
       "      <td>42</td>\n",
       "      <td>repaid</td>\n",
       "      <td>furniture</td>\n",
       "      <td>7882</td>\n",
       "      <td>&lt; 100 DM</td>\n",
       "      <td>4 - 7 yrs</td>\n",
       "      <td>2</td>\n",
       "      <td>single male</td>\n",
       "      <td>guarantor</td>\n",
       "      <td>...</td>\n",
       "      <td>building society savings</td>\n",
       "      <td>45</td>\n",
       "      <td>none</td>\n",
       "      <td>for free</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>none</td>\n",
       "      <td>yes</td>\n",
       "      <td>skilled employee</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>&lt; 0 DM</td>\n",
       "      <td>24</td>\n",
       "      <td>delayed</td>\n",
       "      <td>car (new)</td>\n",
       "      <td>4870</td>\n",
       "      <td>&lt; 100 DM</td>\n",
       "      <td>1 - 4 yrs</td>\n",
       "      <td>3</td>\n",
       "      <td>single male</td>\n",
       "      <td>none</td>\n",
       "      <td>...</td>\n",
       "      <td>unknown/none</td>\n",
       "      <td>53</td>\n",
       "      <td>none</td>\n",
       "      <td>for free</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>none</td>\n",
       "      <td>yes</td>\n",
       "      <td>skilled employee</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 21 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "  checking_balance  months_loan_duration credit_history    purpose  amount  \\\n",
       "0           < 0 DM                     6       critical   radio/tv    1169   \n",
       "1       1 - 200 DM                    48         repaid   radio/tv    5951   \n",
       "2          unknown                    12       critical  education    2096   \n",
       "3           < 0 DM                    42         repaid  furniture    7882   \n",
       "4           < 0 DM                    24        delayed  car (new)    4870   \n",
       "\n",
       "  savings_balance employment_length  installment_rate personal_status  \\\n",
       "0         unknown           > 7 yrs                 4     single male   \n",
       "1        < 100 DM         1 - 4 yrs                 2          female   \n",
       "2        < 100 DM         4 - 7 yrs                 2     single male   \n",
       "3        < 100 DM         4 - 7 yrs                 2     single male   \n",
       "4        < 100 DM         1 - 4 yrs                 3     single male   \n",
       "\n",
       "  other_debtors  ...                  property age  installment_plan  \\\n",
       "0          none  ...               real estate  67              none   \n",
       "1          none  ...               real estate  22              none   \n",
       "2          none  ...               real estate  49              none   \n",
       "3     guarantor  ...  building society savings  45              none   \n",
       "4          none  ...              unknown/none  53              none   \n",
       "\n",
       "    housing existing_credits  default  dependents  telephone foreign_worker  \\\n",
       "0       own                2        1           1        yes            yes   \n",
       "1       own                1        2           1       none            yes   \n",
       "2       own                1        1           2       none            yes   \n",
       "3  for free                1        1           2       none            yes   \n",
       "4  for free                2        2           2       none            yes   \n",
       "\n",
       "                  job  \n",
       "0    skilled employee  \n",
       "1    skilled employee  \n",
       "2  unskilled resident  \n",
       "3    skilled employee  \n",
       "4    skilled employee  \n",
       "\n",
       "[5 rows x 21 columns]"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 1000 entries, 0 to 999\n",
      "Data columns (total 21 columns):\n",
      " #   Column                Non-Null Count  Dtype \n",
      "---  ------                --------------  ----- \n",
      " 0   checking_balance      1000 non-null   object\n",
      " 1   months_loan_duration  1000 non-null   int64 \n",
      " 2   credit_history        1000 non-null   object\n",
      " 3   purpose               1000 non-null   object\n",
      " 4   amount                1000 non-null   int64 \n",
      " 5   savings_balance       1000 non-null   object\n",
      " 6   employment_length     1000 non-null   object\n",
      " 7   installment_rate      1000 non-null   int64 \n",
      " 8   personal_status       1000 non-null   object\n",
      " 9   other_debtors         1000 non-null   object\n",
      " 10  residence_history     1000 non-null   int64 \n",
      " 11  property              1000 non-null   object\n",
      " 12  age                   1000 non-null   int64 \n",
      " 13  installment_plan      1000 non-null   object\n",
      " 14  housing               1000 non-null   object\n",
      " 15  existing_credits      1000 non-null   int64 \n",
      " 16  default               1000 non-null   int64 \n",
      " 17  dependents            1000 non-null   int64 \n",
      " 18  telephone             1000 non-null   object\n",
      " 19  foreign_worker        1000 non-null   object\n",
      " 20  job                   1000 non-null   object\n",
      "dtypes: int64(8), object(13)\n",
      "memory usage: 164.2+ KB\n"
     ]
    }
   ],
   "source": [
    "df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_dummy  = pd.get_dummies(df, drop_first=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 1000 entries, 0 to 999\n",
      "Data columns (total 49 columns):\n",
      " #   Column                                 Non-Null Count  Dtype\n",
      "---  ------                                 --------------  -----\n",
      " 0   months_loan_duration                   1000 non-null   int64\n",
      " 1   amount                                 1000 non-null   int64\n",
      " 2   installment_rate                       1000 non-null   int64\n",
      " 3   residence_history                      1000 non-null   int64\n",
      " 4   age                                    1000 non-null   int64\n",
      " 5   existing_credits                       1000 non-null   int64\n",
      " 6   default                                1000 non-null   int64\n",
      " 7   dependents                             1000 non-null   int64\n",
      " 8   checking_balance_< 0 DM                1000 non-null   uint8\n",
      " 9   checking_balance_> 200 DM              1000 non-null   uint8\n",
      " 10  checking_balance_unknown               1000 non-null   uint8\n",
      " 11  credit_history_delayed                 1000 non-null   uint8\n",
      " 12  credit_history_fully repaid            1000 non-null   uint8\n",
      " 13  credit_history_fully repaid this bank  1000 non-null   uint8\n",
      " 14  credit_history_repaid                  1000 non-null   uint8\n",
      " 15  purpose_car (new)                      1000 non-null   uint8\n",
      " 16  purpose_car (used)                     1000 non-null   uint8\n",
      " 17  purpose_domestic appliances            1000 non-null   uint8\n",
      " 18  purpose_education                      1000 non-null   uint8\n",
      " 19  purpose_furniture                      1000 non-null   uint8\n",
      " 20  purpose_others                         1000 non-null   uint8\n",
      " 21  purpose_radio/tv                       1000 non-null   uint8\n",
      " 22  purpose_repairs                        1000 non-null   uint8\n",
      " 23  purpose_retraining                     1000 non-null   uint8\n",
      " 24  savings_balance_501 - 1000 DM          1000 non-null   uint8\n",
      " 25  savings_balance_< 100 DM               1000 non-null   uint8\n",
      " 26  savings_balance_> 1000 DM              1000 non-null   uint8\n",
      " 27  savings_balance_unknown                1000 non-null   uint8\n",
      " 28  employment_length_1 - 4 yrs            1000 non-null   uint8\n",
      " 29  employment_length_4 - 7 yrs            1000 non-null   uint8\n",
      " 30  employment_length_> 7 yrs              1000 non-null   uint8\n",
      " 31  employment_length_unemployed           1000 non-null   uint8\n",
      " 32  personal_status_female                 1000 non-null   uint8\n",
      " 33  personal_status_married male           1000 non-null   uint8\n",
      " 34  personal_status_single male            1000 non-null   uint8\n",
      " 35  other_debtors_guarantor                1000 non-null   uint8\n",
      " 36  other_debtors_none                     1000 non-null   uint8\n",
      " 37  property_other                         1000 non-null   uint8\n",
      " 38  property_real estate                   1000 non-null   uint8\n",
      " 39  property_unknown/none                  1000 non-null   uint8\n",
      " 40  installment_plan_none                  1000 non-null   uint8\n",
      " 41  installment_plan_stores                1000 non-null   uint8\n",
      " 42  housing_own                            1000 non-null   uint8\n",
      " 43  housing_rent                           1000 non-null   uint8\n",
      " 44  telephone_yes                          1000 non-null   uint8\n",
      " 45  foreign_worker_yes                     1000 non-null   uint8\n",
      " 46  job_skilled employee                   1000 non-null   uint8\n",
      " 47  job_unemployed non-resident            1000 non-null   uint8\n",
      " 48  job_unskilled resident                 1000 non-null   uint8\n",
      "dtypes: int64(8), uint8(41)\n",
      "memory usage: 102.7 KB\n"
     ]
    }
   ],
   "source": [
    "df_dummy.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [],
   "source": [
    "target = \"default\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [],
   "source": [
    "y = df_dummy[target]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [],
   "source": [
    "X = df_dummy.drop(columns=target)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1000, 48)"
      ]
     },
     "execution_count": 53,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>months_loan_duration</th>\n",
       "      <th>amount</th>\n",
       "      <th>installment_rate</th>\n",
       "      <th>residence_history</th>\n",
       "      <th>age</th>\n",
       "      <th>existing_credits</th>\n",
       "      <th>dependents</th>\n",
       "      <th>checking_balance_&lt; 0 DM</th>\n",
       "      <th>checking_balance_&gt; 200 DM</th>\n",
       "      <th>checking_balance_unknown</th>\n",
       "      <th>...</th>\n",
       "      <th>property_unknown/none</th>\n",
       "      <th>installment_plan_none</th>\n",
       "      <th>installment_plan_stores</th>\n",
       "      <th>housing_own</th>\n",
       "      <th>housing_rent</th>\n",
       "      <th>telephone_yes</th>\n",
       "      <th>foreign_worker_yes</th>\n",
       "      <th>job_skilled employee</th>\n",
       "      <th>job_unemployed non-resident</th>\n",
       "      <th>job_unskilled resident</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>6</td>\n",
       "      <td>1169</td>\n",
       "      <td>4</td>\n",
       "      <td>4</td>\n",
       "      <td>67</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>48</td>\n",
       "      <td>5951</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>22</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>12</td>\n",
       "      <td>2096</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>49</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>42</td>\n",
       "      <td>7882</td>\n",
       "      <td>2</td>\n",
       "      <td>4</td>\n",
       "      <td>45</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>24</td>\n",
       "      <td>4870</td>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "      <td>53</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 48 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   months_loan_duration  amount  installment_rate  residence_history  age  \\\n",
       "0                     6    1169                 4                  4   67   \n",
       "1                    48    5951                 2                  2   22   \n",
       "2                    12    2096                 2                  3   49   \n",
       "3                    42    7882                 2                  4   45   \n",
       "4                    24    4870                 3                  4   53   \n",
       "\n",
       "   existing_credits  dependents  checking_balance_< 0 DM  \\\n",
       "0                 2           1                        1   \n",
       "1                 1           1                        0   \n",
       "2                 1           2                        0   \n",
       "3                 1           2                        1   \n",
       "4                 2           2                        1   \n",
       "\n",
       "   checking_balance_> 200 DM  checking_balance_unknown  ...  \\\n",
       "0                          0                         0  ...   \n",
       "1                          0                         0  ...   \n",
       "2                          0                         1  ...   \n",
       "3                          0                         0  ...   \n",
       "4                          0                         0  ...   \n",
       "\n",
       "   property_unknown/none  installment_plan_none  installment_plan_stores  \\\n",
       "0                      0                      1                        0   \n",
       "1                      0                      1                        0   \n",
       "2                      0                      1                        0   \n",
       "3                      0                      1                        0   \n",
       "4                      1                      1                        0   \n",
       "\n",
       "   housing_own  housing_rent  telephone_yes  foreign_worker_yes  \\\n",
       "0            1             0              1                   1   \n",
       "1            1             0              0                   1   \n",
       "2            1             0              0                   1   \n",
       "3            0             0              0                   1   \n",
       "4            0             0              0                   1   \n",
       "\n",
       "   job_skilled employee  job_unemployed non-resident  job_unskilled resident  \n",
       "0                     1                            0                       0  \n",
       "1                     1                            0                       0  \n",
       "2                     0                            0                       1  \n",
       "3                     1                            0                       0  \n",
       "4                     1                            0                       0  \n",
       "\n",
       "[5 rows x 48 columns]"
      ]
     },
     "execution_count": 54,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn import model_selection"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size = 0.3, random_state = 1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn import tree"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',\n",
       "                       max_depth=None, max_features=None, max_leaf_nodes=None,\n",
       "                       min_impurity_decrease=0.0, min_impurity_split=None,\n",
       "                       min_samples_leaf=1, min_samples_split=2,\n",
       "                       min_weight_fraction_leaf=0.0, presort='deprecated',\n",
       "                       random_state=None, splitter='best')"
      ]
     },
     "execution_count": 58,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "est = tree.DecisionTreeClassifier()\n",
    "est.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_test_pred = est.predict(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn import metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.6433333333333333"
      ]
     },
     "execution_count": 61,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "metrics.accuracy_score(y_test, y_test_pred)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1    0.694286\n",
       "2    0.305714\n",
       "Name: default, dtype: float64"
      ]
     },
     "execution_count": 62,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.Series(y_train).value_counts()/len(y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1    0.713333\n",
       "2    0.286667\n",
       "Name: default, dtype: float64"
      ]
     },
     "execution_count": 63,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.Series(y_test).value_counts()/len(y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>actual</th>\n",
       "      <th>prediction</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>507</th>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>818</th>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>452</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>368</th>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>242</th>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>459</th>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>415</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>61</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>347</th>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>349</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>300 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     actual  prediction\n",
       "507       2           2\n",
       "818       1           2\n",
       "452       1           1\n",
       "368       2           2\n",
       "242       2           2\n",
       "..      ...         ...\n",
       "459       1           2\n",
       "415       1           1\n",
       "61        1           1\n",
       "347       1           2\n",
       "349       2           1\n",
       "\n",
       "[300 rows x 2 columns]"
      ]
     },
     "execution_count": 65,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.DataFrame({\"actual\": y_test, \"prediction\": y_test_pred})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "193"
      ]
     },
     "execution_count": 68,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.sum(y_test == y_test_pred)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.6433333333333333"
      ]
     },
     "execution_count": 69,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "193/len(y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[158,  56],\n",
       "       [ 51,  35]])"
      ]
     },
     "execution_count": 70,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "metrics.confusion_matrix(y_test, y_test_pred)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "193"
      ]
     },
     "execution_count": 71,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "158 + 35"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "              precision    recall  f1-score   support\n",
      "\n",
      "           1       0.76      0.74      0.75       214\n",
      "           2       0.38      0.41      0.40        86\n",
      "\n",
      "    accuracy                           0.64       300\n",
      "   macro avg       0.57      0.57      0.57       300\n",
      "weighted avg       0.65      0.64      0.65       300\n",
      "\n"
     ]
    }
   ],
   "source": [
    "print(metrics.classification_report(y_test, y_test_pred))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "train accuracy 1.0\n",
      "test accuracy 0.6566666666666666\n"
     ]
    }
   ],
   "source": [
    "est = tree.DecisionTreeClassifier()\n",
    "est.fit(X_train, y_train)\n",
    "y_train_pred = est.predict(X_train)\n",
    "y_test_pred = est.predict(X_test)\n",
    "print(\"train accuracy\", metrics.accuracy_score(y_train, y_train_pred))\n",
    "print(\"test accuracy\", metrics.accuracy_score(y_test, y_test_pred))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the fitted tree as Graphviz DOT, then render it to tree.png with the dot CLI.\n",
    "from sklearn.tree import export_graphviz\n",
    "export_graphviz(\n",
    "    est,\n",
    "    out_file=\"tree.dot\",\n",
    "    feature_names=X.columns,\n",
    "    filled=True,\n",
    ")\n",
    "!dot -Tpng tree.dot -o tree.png"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "train accuracy 0.7514285714285714\n",
      "test accuracy 0.7333333333333333\n"
     ]
    }
   ],
   "source": [
    "est = tree.DecisionTreeClassifier(max_depth=3)\n",
    "est.fit(X_train, y_train)\n",
    "y_train_pred = est.predict(X_train)\n",
    "y_test_pred = est.predict(X_test)\n",
    "print(\"train accuracy\", metrics.accuracy_score(y_train, y_train_pred))\n",
    "print(\"test accuracy\", metrics.accuracy_score(y_test, y_test_pred))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the fitted tree as Graphviz DOT, then render it to tree.png with the dot CLI.\n",
    "from sklearn.tree import export_graphviz\n",
    "export_graphviz(\n",
    "    est,\n",
    "    out_file=\"tree.dot\",\n",
    "    feature_names=X.columns,\n",
    "    filled=True,\n",
    ")\n",
    "!dot -Tpng tree.dot -o tree.png"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "train accuracy 0.7385714285714285\n",
      "test accuracy 0.7433333333333333\n"
     ]
    }
   ],
   "source": [
    "est = tree.DecisionTreeClassifier(max_depth=3, min_samples_leaf=20)\n",
    "est.fit(X_train, y_train)\n",
    "y_train_pred = est.predict(X_train)\n",
    "y_test_pred = est.predict(X_test)\n",
    "print(\"train accuracy\", metrics.accuracy_score(y_train, y_train_pred))\n",
    "print(\"test accuracy\", metrics.accuracy_score(y_test, y_test_pred))\n",
    "from sklearn.tree import export_graphviz\n",
    "export_graphviz(est, out_file = \"tree.dot\", feature_names = X.columns, filled=True)\n",
    "!dot -Tpng tree.dot -o tree.png"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "train accuracy 0.7385714285714285\n",
      "test accuracy 0.7433333333333333\n"
     ]
    }
   ],
   "source": [
    "est = tree.DecisionTreeClassifier(max_depth=3, min_samples_leaf=20, criterion=\"entropy\")\n",
    "est.fit(X_train, y_train)\n",
    "y_train_pred = est.predict(X_train)\n",
    "y_test_pred = est.predict(X_test)\n",
    "print(\"train accuracy\", metrics.accuracy_score(y_train, y_train_pred))\n",
    "print(\"test accuracy\", metrics.accuracy_score(y_test, y_test_pred))\n",
    "from sklearn.tree import export_graphviz\n",
    "export_graphviz(est, out_file = \"tree.dot\", feature_names = X.columns, filled=True)\n",
    "!dot -Tpng tree.dot -o tree.png"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0.67857143, 0.69285714, 0.67142857, 0.73571429, 0.70714286])"
      ]
     },
     "execution_count": 81,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "scores = model_selection.cross_val_score(est, X_train, y_train, cv = 5)\n",
    "scores"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.6971428571428572"
      ]
     },
     "execution_count": 82,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.mean(scores)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Fitting 5 folds for each of 324 candidates, totalling 1620 fits\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.\n",
      "[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    1.2s\n",
      "[Parallel(n_jobs=8)]: Done 1592 tasks      | elapsed:    2.7s\n",
      "[Parallel(n_jobs=8)]: Done 1620 out of 1620 | elapsed:    2.8s finished\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "GridSearchCV(cv=5, error_score=nan,\n",
       "             estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,\n",
       "                                              criterion='entropy', max_depth=3,\n",
       "                                              max_features=None,\n",
       "                                              max_leaf_nodes=None,\n",
       "                                              min_impurity_decrease=0.0,\n",
       "                                              min_impurity_split=None,\n",
       "                                              min_samples_leaf=20,\n",
       "                                              min_samples_split=2,\n",
       "                                              min_weight_fraction_leaf=0.0,\n",
       "                                              presort='deprecated',\n",
       "                                              random_state=None,\n",
       "                                              splitter='best'),\n",
       "             iid='deprecated', n_jobs=8,\n",
       "             param_grid={'criterion': ['gini', 'entropy'],\n",
       "                         'max_depth': array([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,\n",
       "       19]),\n",
       "                         'min_samples_leaf': array([ 5, 10, 15, 20, 25, 30, 35, 40, 45])},\n",
       "             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,\n",
       "             scoring='accuracy', verbose=True)"
      ]
     },
     "execution_count": 88,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "param_grid = {\n",
    "    \"max_depth\": np.arange(2, 20),\n",
    "    \"criterion\": [\"gini\", \"entropy\"],\n",
    "    \"min_samples_leaf\": np.arange(1, 10) * 5\n",
    "}\n",
    "\n",
    "\n",
    "gsearch =model_selection.GridSearchCV(est, param_grid=param_grid, scoring=\"accuracy\"\n",
    "                                      , cv = 5, verbose = True, n_jobs= 8)\n",
    "gsearch.fit(X, y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1620"
      ]
     },
     "execution_count": 91,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "18 * 2 * 9 * 5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "18"
      ]
     },
     "execution_count": 92,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(np.arange(2, 20))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.7289999999999999"
      ]
     },
     "execution_count": 93,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "gsearch.best_score_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'criterion': 'gini', 'max_depth': 11, 'min_samples_leaf': 10}"
      ]
     },
     "execution_count": 94,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "gsearch.best_params_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "train accuracy 0.8185714285714286\n",
      "test accuracy 0.7266666666666667\n"
     ]
    }
   ],
   "source": [
    "est = tree.DecisionTreeClassifier(max_depth=11, min_samples_leaf=10, criterion=\"gini\")\n",
    "est.fit(X_train, y_train)\n",
    "y_train_pred = est.predict(X_train)\n",
    "y_test_pred = est.predict(X_test)\n",
    "print(\"train accuracy\", metrics.accuracy_score(y_train, y_train_pred))\n",
    "print(\"test accuracy\", metrics.accuracy_score(y_test, y_test_pred))\n",
    "from sklearn.tree import export_graphviz\n",
    "export_graphviz(est, out_file = \"tree.dot\", feature_names = X.columns, filled=True)\n",
    "!dot -Tpng tree.dot -o tree.png"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 103,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Fitting 5 folds for each of 120 candidates, totalling 600 fits\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.\n",
      "[Parallel(n_jobs=8)]: Done  56 tasks      | elapsed:    0.2s\n",
      "[Parallel(n_jobs=8)]: Done 600 out of 600 | elapsed:    0.8s finished\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "GridSearchCV(cv=5, error_score=nan,\n",
       "             estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,\n",
       "                                              criterion='gini', max_depth=11,\n",
       "                                              max_features=None,\n",
       "                                              max_leaf_nodes=None,\n",
       "                                              min_impurity_decrease=0.0,\n",
       "                                              min_impurity_split=None,\n",
       "                                              min_samples_leaf=10,\n",
       "                                              min_samples_split=2,\n",
       "                                              min_weight_fraction_leaf=0.0,\n",
       "                                              presort='deprecated',\n",
       "                                              random_state=None,\n",
       "                                              splitter='best'),\n",
       "             iid='deprecated', n_jobs=8,\n",
       "             param_grid={'criterion': ['gini', 'entropy'],\n",
       "                         'max_depth': array([2, 3, 4, 5]),\n",
       "                         'min_samples_leaf': array([ 5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19])},\n",
       "             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,\n",
       "             scoring='accuracy', verbose=True)"
      ]
     },
     "execution_count": 103,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "param_grid = {\n",
    "    \"max_depth\": np.arange(2, 6),\n",
    "    \"criterion\": [\"gini\", \"entropy\"],\n",
    "    \"min_samples_leaf\": np.arange(5, 20)\n",
    "}\n",
    "\n",
    "\n",
    "gsearch =model_selection.GridSearchCV(est, param_grid=param_grid, scoring=\"accuracy\"\n",
    "                                      , cv = 5, verbose = True, n_jobs= 8)\n",
    "gsearch.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 104,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(0.7685714285714286, 0.7228571428571429)"
      ]
     },
     "execution_count": 104,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "gsearch.score(X_train, y_train), gsearch.best_score_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'criterion': 'gini', 'max_depth': 5, 'min_samples_leaf': 12}"
      ]
     },
     "execution_count": 105,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "gsearch.best_params_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 107,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_test_pred = est.predict_proba(X_test)[:, 1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn import preprocessing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 112,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Learn the label -> integer mapping from y_train and apply it to y_test.\n",
    "label_encoder = preprocessing.LabelEncoder().fit(y_train)\n",
    "y_test_le = label_encoder.transform(y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 114,
   "metadata": {},
   "outputs": [],
   "source": [
    "fpr, tpr, thresholds = metrics.roc_curve(y_test_le, y_test_pred)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 115,
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 122,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Text(0, 0.5, 'TPR')"
      ]
     },
     "execution_count": 122,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYIAAAEWCAYAAABrDZDcAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjMsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+AADFEAAAgAElEQVR4nO3deXwV5dn/8c9FCKuELWFfArKjuBDBrW5QRdpKF6vSWpdabR9rN32sdnlsq+1Tq/66PbVV21qX1qqtrVKronWpKGIJgsiqkS0BQhJ2CJDt+v0xEz0eTkJIMpkk5/t+vc7rdWbuWa45Z85cc99znxlzd0REJH11iDsAERGJlxKBiEiaUyIQEUlzSgQiImlOiUBEJM0pEYiIpDklAhGRNKdE0ARmts7M9pnZHjMrNrP7zOyIpGlONrMXzGy3me00s3+Y2YSkabLM7OdmtiFcVkE4nN2yWyRNZWadzexeM9sV7hPXHmL6kWb2ZLh/lJnZbUnlF5nZSjPba2bvmtmHEsqmmdkqMys3sxfNbHhC2QVmNj8seylpmWPM7AkzKzWzbWY218zGJpSbmf3QzDaG++xLZjYxofwOM3snjHmVmV2SUJZtZq+a2VYz22Fmr5nZKQnll5rZovDzKTKz28ysY0L5+PD3sjP8HXwiKfYvhOP3mNkzZjYooex6M1sWxrXWzK5PmvdYM5sXLrvIzG5KKDvRzJ4LP49SM/uLmQ1MKH86XGftq8LM3grLhiWV7TEzN7Pr6vnqWxd316uRL2AdMD18PwB4E/hRQvlJwB7ga0APoA/wQ2A7MDKcphOwEHgOmECQnPsB/wPMjDD2jnF/fq391ZjPCPgxMA/oDYwHioEZdUzbCXgXuBboDnQBJiWUfxhYD5wY7heDgcFhWTawE/h0ON/twIKEeacDFwA3AS8lrXcKcEW4P2YCtwCrEsovADYBI4GMcJveSCj/ATAujGlquD+fHJZ1AcaGZQZ8HNhW+1kC/wV8KNz2wcAi4Mbazxt4O/w8MoCzgL3AmLD8dKAEmBjO/xvg3wlxfRM4PlzO2PCzuyihfAXwo3DZRwKbgfPCsnPDzzIL6AbcCzxTz/f8EnBTHWUjgGogN+59uMH7bdwBtOUXCYkgHL4N+GfC8Dzg1ynmexp4IHz/BWALcMRhrHciQeLYFs777XD8fcAPE6Y7AyhKivcGYClwAPgu8NekZf8C+GX4vifw+/AHs5EgiWU0MMbLgZXAbmAN8MWEssuAV5Kmd2BU+L4r8P/CH/JO4BWga4p1ZANPAjvCz2Ie0CEsGwr8DSgFtgK/Csd3CLd7fXhQeQDoGZblhnFcAWwAXg7HnwjMD9fzJnBGPdu9ETg7YfgW4OE6pr0KmFfPsuYDV9Qz7/yE4e7APmBc0nRfICkRpFhWn3C7+4bDNwCPJu1v++uZfw5wXYrxHYCPhcvuV8e81wL/CN8fRXDiZAnlzwK3hO/vAO5MKBsULvvIOpb9S+D/EobLgQkJw38BvlXHvMcDu+soyyU40I+oo/x7wIsN/T23hpeahpqJmQ0hOKsoCIe7AScT7GzJHiU424PgzO0Zd9/TwPX0AP4FPEPwQxgFPH8Yoc4GPgL0Ah4EZppZVrjsDIKzwYfCae8HqsJ1HAecTXBgaYgS4KMEZ1iXAz8zs+MbOO8dwGSCz68PwZleTYrprgOKgBygP/BtwMPteJLgYJ9LcOb5cDjPZeHrTIIz3iOAXyUt93SCs/lzzGww8E+CJNgH+G/gMTPLATCzG83syfB9b4Lv5M2EZb1JcCBN5URgXdjsUBY2wRwdLisDyANywqaQIjP7lZl1DeedmLged99LULuoa131OQ0odvet4fDDwKiwCSkTuJRgfztIGM8JwPKk8UuB/QRJ4nfuXlLPumvntVSrIEgQte8tqYyE8sT1G0HNIzGunwOXmFlm2BR2EsFv6VBxJbuEIIGvraf8/jrK
Wqe4M1FbfhGcYe8hOOt1ggNyr7BsSDhuXIr5ZgCV4fvngFsPY52zgcV1lN3HoWsEn0+a5xXgkvD9h4F3w/f9CWoNXZPW/WIjP6vHga+F7y+jjhoBwVnkPuCYBizzZuAJwppEwviTCGoCBzXthN/R1QnDY4FKguaE3DCOkQnlNwAPJi1jLnBpimUPDefvkjDuw8C6OuJ/Nlz3uQRNHdcT1J468f7Zbj4wkKD28yph0yNBTe3WpOW9ClyWNK7eGkG4n24EZieM60RQM3SCE4G11H32ez9BkrAUZV3Cfeagzyosv5wgkWeHw5nh9n8zfH82UAHMDcunAWXAJIJa490EJwizUyz7BwSJsnPCuJMJTtSqwm37QR1xTSKoYX6ojvKC5M85oexDBMeEBtfwW8NLNYKm+7i79yA46I4j+MFC0G5aQ/AjTjaQYIeGoNki1TR1GUpw5tdYhUnDDxH8WAE+w/u1geEEP8bN4UW/HQQ/vH4NWYmZnWtmC8KLbzuAmbz/2dQnm+AA0pBtvJ3gR/msma0xsxvD8UOB9e5elWKeQQQ1hVrrCZJA/4RxiZ/RcODTtZ9BuC2nkvo7q63VZSWMyyI4UUhlH0FCfNrdKwhqQn0JaiP7wmn+z903u3sZ8FOCz7F2XVlJy6tvXQcJazXPEjRf/jmh6HsEZ/lDCb6LHwAvhLXcxPlvJzgbv8DDo2Aid98fLvdGMzsmad6PA7cC54bbhrtXElxT+AjBtZXrCGrPRWH582FsjxF8b+vC7S1KWvY1BGflH3H3A+G4PgQJ6+Zwm4YS1PiuTpp3FEHT7dfcfV6Kz+xUguuBf00uC10KPOYNrOG3FkoEzcTd/01wRn5HOLwXeI3gAlSyC3i/OedfBDtk9wauqpDgQlcqewkudNUakCrUpOG/AGeETVuf4P1EUEhQI8h2917hK8vdD9n0YGadCX6sdwD93b0X8BTvV+U/EKeZJcZZRtCkUNc2vr8h7rvd/Tp3H0nQFn2tmU0LYx+W2BslwSaCg3utYQRniFsSF53wvpCgRtAr4dXd3W9NEc92guspiQe9Y6i7iWEpB38ficsqqqs8XOZ76wn3nyPrWdcHhM1YzwJz3P1HScXHAI+4e5G7V7n7fQQXvyckzP8DgprM2e6+6xCryyRohquddwbwW+Bj7v5W4oTuvtTdT3f3vu5+TjjffxLK73T30e7ej2Af6wgsS1j254EbgWnunpggRgLV7v5AuE1FBE1gMxPmHU7we7zF3R+sY1suBf6W6kAfNpN9mrbWLARqGmrKi4MvFucQHOSODYdPDYe/StBrqDdBW/MOYHQ4TWeCXkPP8H5PjL4E7d0H9RoKl7MZ+Ho4bw9galh2JbCKoC17ALCAg5uGpqdY5tMETVSLk8Y/QdBEkBXGdSRweliWS3CQyq0jxmqCtnYjOGCUEzZbAWMIksyxBGdnd/HBi8V3EiTKQQQ9PE4ioYqfsJ6PEjQnGcEZ3maCmlkGQbPAHbzfG+eUcJ4vAO8Q9Ow4guDM7o9J29QxYR1DCc5OzwmX2yVcx5A69olbgX+H3/W4MKa6eg2NDT+X6eGyv0FQE+oUlt8c7hv9wuXN4/0LpzkEF9I/Fcb0Ez7Ya6g21i8BL4fvM8OyLIKD66/qiOt7BE2G/cPv/XME+3Fts+e3ws9wYIp5TyTY7zsRNN/cQHDWPigsP4ugFnxaHeueFMbajeB6zNra7z4cf1T4fQ8j6Lnzvwnzfjb8rsanWG4Wwe/uM+E2DSA4UattahscfvbX1/N77xou46w6yj9DUFM5qJmstb9iD6Atv0hxYCXo0vZYwvCp4Q67B9hFcOHxqKR5ehJcyCoMp3uXoBmgbx3rPYrgQLk93PFru991AR4J17M0PLA0JBF8juAAeH2KuH5DcGa6E1hM2B2PoC10Xe3BJcUyv0xwlr2D4KL0w3zw+sV3CM7+C4GLObjX0M8J2q53EhzIUvUa+kYYw94wxv9JKBtGcF1ia7ie2p5QHQi6
VBYSXEf4I9A7LMslKRGE46cSHNy3hfP8ExgWln0beDph2s4EXQ93hdt/bVJMe2rnDcd9kqB5a1e4n0xMKMsEfh1+hsUEvWASrz9MJ0j8+8J5cxPKLgu3JfF1X1h2aTi8N4yn9lW7TV0IkvHmMK43SEhm4bwHkuat7bl2OkES3h1+Xv8m4aAPvEhQA0ucN/Hzu51gv95DcIIyKqGsF8F+vTf8PH5MQi82gqRRmbTsuxLKzyJIrDvD+X8LdAvLvhduV+K8e5L2g9nUc6AnuHZ0S9zHpca8LNwAkcNiZt8FSt397rhjEZGmUSIQEUlzulgsIpLmlAhERNKcEoGISJpL1c+6VcvOzvbc3Ny4wxARaVMWLVpU5u45qcraXCLIzc0lPz8/7jBERNoUM1tfV5mahkRE0pwSgYhImlMiEBFJc0oEIiJpTolARCTNRZYILHiAd4mZLauj3Mzsl+HTl5YextOrRESkGUVZI7iP4ElcdTkXGB2+riK4y6WIiLSwyBKBu79McBvauswieIC7u/sCoJeZHc6TukRE2j13Z8WmXfzsubdZVXyoZwA1Tpx/KBvMBx8JWBSO25w8oZldRVBrYNiwYS0SnIhIXKprnEXrt/Ps8mLmriimcNs+zCC7R2fGDUh+QmnTxZkILMW4uh7bdw9wD0BeXp7umy0i7c6BqmrmF2xl7vJi/rVyC2V7KsjMME4Zlc3VZ4xi+vj+5PToHMm640wERQSPAaw1hOB5siIiaWH3/kpeXF3Ks8uLeWl1KXsOVNG9UwZnjOvHORMHcObYHHp0yYw8jjgTwRzgGjN7mOBRgDvd/aBmIRGR9qR09wH+tXILc5cXM79gKxXVNfTt3omPThrIORMHcNKRfemSmdGiMUWWCMzszwQP+c42syKCZ4JmArj7XcBTwEyC57WWA5dHFYuISJw2bC1n7vJinl1RTP767bjDkN5d+dxJwzln4gAmD+9NRodUreUtI7JE4O6zD1HuBA84FxFpV9ydlZt3M3d5MXOXF7OqeDcA4wb04Ktnjebsif2ZMDALs/gO/ona3G2oRURao7p6+kwe1pvvzBzP2RP7M7xv97jDTEmJQESkkeLs6dOclAhERA7D7v2VvLS6lLkx9/RpTkoEIiKH0Bp7+jQnJQIRkRQ2bC3n2RXBxd7W2NOnOSkRiIjQ9nr6NCclAhFJW225p09zUiIQkbTSXnr6NCclAhFp99pjT5/mpEQgIu1Se+/p05yUCESk3Uinnj7NSYlARNqsdO7p05yUCESkTVFPn+anRCAirZ56+kRLiUBEWiX19Gk5SgQi0mqop088lAhEJFbq6RM/JQIRaVHq6dP6KBGISOSqa5w3Nmxn7jL19GmNlAhEJBLq6dN2KBGISLNRT5+2SYlARJpEPX3aPiUCETlshdvK37vYq54+bZ8SgYgcknr6tG9KBCKSknr6pA8lAhF5j3r6pCclApE0p54+okQgkobU00cSKRGIpAn19JG6KBGItFPq6SMNpUQg0o6op480RqSJwMxmAL8AMoDfufutSeXDgPuBXuE0N7r7U1HGJNLeqKePNFVkicDMMoA7gQ8DRcBCM5vj7isSJvsu8Ki7/8bMJgBPAblRxSTSnrzyThkPL9ygnj7SZFHWCKYABe6+BsDMHgZmAYmJwIGs8H1PYFOE8Yi0CzU1zs+ff4dfPv+OevpIs4gyEQwGChOGi4CpSdN8H3jWzL4CdAemp1qQmV0FXAUwbNiwZg9UpK3Yvb+Sax99k+dWbOH8yUP44ceP0sFfmqxDhMtO1RXBk4ZnA/e5+xBgJvCgmR0Uk7vf4+557p6Xk5MTQagird/asr184tfzeWFVCd/72ARuP3+SkoA0iyhrBEXA0IThIRzc9HMFMAPA3V8zsy5ANlASYVwibc5Lq0v4yp8X07GD8eDnp3DyqOy4Q5J2JMoawUJgtJmNMLNOwEXAnKRpNgDTAMxsPNAFKI0wJpE2ZUd5BT98cgWX37eQwb26Muea
U5UEpNlFViNw9yozuwaYS9A19F53X25mNwP57j4HuA74rZl9g6DZ6DJ3T24+Ekk7+yqq+cP8tfzmpXfZe6CKC/OGctPHJtCtk/76I80v0r0q/E/AU0njbkp4vwI4JcoYRNqaf63YwncfX0bxrv1MH9+P688Zx9gBPeIOS9oxnV6ItCI1Nc6Nf3uLXt0yeeSiE5k6sm/cIUkaiPIagYgcpsWFOyjbc4CvnDVKSUBajBKBSCvy7IpiOnYwzhjbL+5QJI0oEYi0Is+t2MLUkX3o2VW3h5CWo0Qg0kq8W7qHNaV7OXvCgLhDkTSjRCDSSjy3YgsA0yf0jzkSSTdKBCKtxHMrtjBxUBaDe3WNOxRJM0oEIq1A6e4DvLFhOx9WbUBioEQg0go8v3IL7uj6gMRCiUCkFXhuxRYG9+rK+IH6B7G0PCUCkZiVV1TxSkEZH57QXw+Sl1goEYjE7OW3yzhQVcPZuj4gMVEiEInZsyuKyerSkRNG9Ik7FElTSgQiMSqvqOKFVSVMG9+fzAz9HCUe2vNEYuLufOfvy9i5r5KLThh66BlEIqJEIBKTP76+gb8v3sg3po/RnUYlVkoEIjFYUriDm/+xnDPG5nDNmaPiDkfSnBKBSAvbtreCq/+4iH49uvDzC4+lQwd1GZV46QllIi2ousb52sOLKdtbwWNfOple3TrFHZKIagQiLekXz7/DvHfKuPm8iRw9pGfc4YgASgQiLebFVSX88vl3+PTkIVyoXkLSiigRiLSAwm3lfP2RJYwfmMUtHz9Kt5KQVkWJQCRiu/dXcvWf3qDGnbsuPp4umRlxhyTyAbpYLBKhNaV7uPKBfNZtLefuiyczvG/3uEMSOYgSgchh2HugindK9lBeUXXIaTfv2M/3/7GczIwO/PGKqZx0pP40Jq2TEoFICjU1zoZt5awq3sXKzbtZVbyL1cW7Wb+tHPeGL2fCwCzu/txkhvbpFl2wIk2kRCBpb+e+SlYX737voL9y8y7e3rKb8opqAMxgRN/uTBiUxSePH8LYAT3o2TXzkMvtYMakIT11TUBaPSUCSVt7D1Tx5Yfe4KXVpe+N69Utk/EDsrjwhKGMH5DFuIE9GN2vB1076WAu7ZcSgaSlXfsr+fwfFrK4cAdfOWsUk4f3ZvzALPr16KyunZJ2lAgk7ewsr+SSe19n+aZd/Gr2cZx79MC4QxKJVaT/IzCzGWa22swKzOzGOqa5wMxWmNlyM3soynhEtu2tYPZvF7By827uuniykoAIEdYIzCwDuBP4MFAELDSzOe6+ImGa0cC3gFPcfbuZ9YsqHpHS3Qf47O8WsH5rOb+9NI/Tx+TEHZJIqxBljWAKUODua9y9AngYmJU0zZXAne6+HcDdSyKMR9JY8c79XHjPaxRu28cfLj9BSUAkQZSJYDBQmDBcFI5LNAYYY2avmtkCM5uRakFmdpWZ5ZtZfmlpaapJROpUtL2cC+5+jZJdB3jgiimcfGR23CGJtCpRJoJUXS+S/4rTERgNnAHMBn5nZr0Omsn9HnfPc/e8nBydyUnDPbOsmE/8ej47yiv44xemckJun7hDEml1ouw1VAQk3mt3CLApxTQL3L0SWGtmqwkSw8II45I0ULJ7P9+fs5yn3ipmwsAsfnrhMYwbkBV3WCKtUpSJYCEw2sxGABuBi4DPJE3zOEFN4D4zyyZoKloTYUzSzrk7f11UxA//uZJ9ldV8c8ZYrvzQSDIzdKNdkbpElgjcvcrMrgHmAhnAve6+3MxuBvLdfU5YdraZrQCqgevdfWtUMUn7VritnG///S3mvVPGCbm9ufVTkzgy54i4wxJp9cwP5w5arUBeXp7n5+fHHYa0Iu7O/fPX8ZNnVtPB4MaZ4/nslGF6KLxIAjNb5O55qcr0z2Jp0/ZVVPPNx5byjzc3cebYHH70iaMZ1Ktr3GGJtClKBNJmFW4r54sPLmJl8S5umDGOL50+UvcJEmkEJQJpk157dytffugNKqtruPeyEzhzrP6U
LtJYSgTSptReD7jlnysZkd2dez43mZG6ICzSJEoE0mbsr6zmfx5fxl8WFTF9fH9+duEx9Ohy6AfEiEj9lAikTdiyaz9ffHARSwp38NVpo/n6tNHqFSTSTA47EYR3Fb3I3f8UQTwiH1Cyaz8vrCrhp8+9zZ4DVdx18fHMOEq3jhZpTnUmAjPLAr5McKO4OcBzwDXAfwNLACUCaXY1Nc7SjTt5YVUJL64q4a2NOwEY1e8IHrxiKmMH9Ig5QpH2p74awYPAduA14AvA9UAnYJa7L2mB2CRN7Npfyby3y3hhVQn/fruEsj0VdDA4flhvrj9nLNPG92Ns/x7qGioSkfoSwUh3PxrAzH4HlAHD3H13i0Qm7Za7827pXl5cVcILq0pYuG4bVTVOz66ZnD4mh7PG9eP0MTn07t4p7lBF0kJ9iaCy9o27V5vZWiUBaawDVdW8vmYbL4QH/w3bygEY278HX/jQSKaN78dxQ3vRUTeHE2lx9SWCY8xsF+8/V6BrwrC7u+7pK3Xatb+SN9ZvZ9H67eSv286Swh3sq6ymc8cOnHxkX648bSRnjs1hSO9ucYcqkvbqTATuntGSgUjb5e4Ubd9H/vpt5K8LDv6rt+zGHToYTBiUxYUnDOW0MdmcNDKbrp20a4m0JvX1GuoCfAkYBSwluI10VUsFJq1bTY0zr6CMxxYV8frarWzZdQCAIzp35LhhvTj3qIHk5fbm2KG96N5Zf1cRac3q+4XeT3CdYB4wE5gIfK0lgpLWq3T3Af6yqJA//2cDhdv20ad7J04dlU1ebm8mD+/NuAFZZOiPXiJtSn2JYEJCr6HfA/9pmZCktXF3Xnt3K396fQPPriimsto5cWQfrj9nHOdM7E/njmrqEWnLGtprqEp9uNPPtr0VPLaoiIf+s4G1ZXvp2TWTS07KZfaUYYzqpxu9ibQX9SWCY8NeQhD0FFKvoTTg7ixct52HXl/PU28VU1Fdw+ThvfnKWaOYefRAumTq7F+kvakvEbzp7se1WCQSuz0HqvjvR9/kmeXF9OjckYumDOUzU4cxboByvkh7Vl8iaFsPM5YmWVe2l6sezKegZA/fnDGWy07OpVsn9fYRSQf1/dL7mdm1dRW6+08jiEcisnDdNu6Yu5rK6pqU5e+U7CGjg/HA56dy6ujsFo5OROJUXyLIAI7g/X8WSxtVXlHF1x9eQmV1TZ137zxtdA43zBjHsL76p69IuqkvEWx295tbLBKJzC+ef4eNO/bx6BdPYsqIPnGHIyKtTH13+FJNoB1YVbyL389bywV5Q5QERCSl+hLBtBaLQiJRU+N89+/L6NGlI986d3zc4YhIK1VnInD3bS0ZiDS/R/MLyV+/nW/PHK97+4tInXTz93Zq654D/PjpVUwd0YfzJw+JOxwRacXUUTwG8wvK+Pm/ggu4USmvqKK8oooffeIoPeJRROqlRNCClm/ayU+eWc3Lb5cyqGcXTjoymyiP0dPG9WNUPz3sXUTqp0TQAg5UVfOtv73F397YSK9umXz3I+O5+MThum+PiLQKSgQt4P756/jbGxv54mkjufrMUfTsmhl3SCIi74n0YrGZzTCz1WZWYGY31jPd+WbmZpYXZTxx2FleyZ0vvsvpY3L41szxSgIi0upElgjMLAO4EzgXmADMNrMJKabrAXwVeD2qWOL0638XsGt/JTfMGBd3KCIiKUVZI5gCFLj7GnevAB4GZqWY7hbgNmB/hLHEYtOOffzh1XV84tjBTBikWzmLSOsUZSIYDBQmDBeF495jZscBQ939yfoWZGZXmVm+meWXlpY2f6QR+dlzb4PDtWePiTsUEZE6RZkIUnWMfO8ZB2bWAfgZcN2hFuTu97h7nrvn5eTkNGOI0VldvJvH3ijikpOGM6S37ugpIq1XlImgCBiaMDwE2JQw3AM4CnjJzNYBJwJz2ssF4588s4runTvy5TNHxR2KiEi9okwEC4HRZjbCzDoBFwFzagvdfae7Z7t7rrvnAguA89w9
P8KYWsSCNVt5YVUJV58xSvf4EZFWL7JE4O5VwDXAXGAl8Ki7Lzezm83svKjWGzd359anVzEgqwuXn5IbdzgiIocU6R/K3P0p4KmkcTfVMe0ZUcbSUp5ZVsySwh3c9qlJ+uewiLQJuvtoM6qsruG2uasZ3e8IPnn84EPPICLSCigRNKNHFhaytmwvN8wYR8cMfbQi0jboaNVM9h6o4uf/eocpuX2YNr5f3OGIiDSYEkEz+d28tZTtOcAN547T/f9FpE1RImgGZXsOcM/L7zJj4gAmD+8ddzgiIodFiaAZ/N/z77C/qobrZ4yNOxQRkcOmRNBE67fu5U+vb+DCE4ZyZM4RcYcjInLYlAia6Pa5q8nM6MDXp42OOxQRkUZRImiCrXsO8OTSzVx6ci79srrEHY6ISKMoETTBvspqAEbmdI85EhGRxlMiEBFJc0oEIiJpTolARCTNKRGIiKQ5JQIRkTSnRCAikuaUCERE0pwSgYhImlMiEBFJc0oETeAedwQiIk2nRNAEb23cCcCQXl1jjkREpPGUCJrg8cUbyenRmakj+8YdiohIoykRNNLO8kpeWl3KxyYNIqODHk0pIm2XEkEjPb1sMxXVNXz8uEFxhyIi0iRKBI30xJJNjMjuztGDe8YdiohIkygRNELxzv0sWLuVWccOwkzNQiLStikRNMI/3tyEO8w6dnDcoYiINJkSQSM8vmQjxwzpyYhsPZlMRNo+JYLDVFCym+WbdnGeagMi0k4oERymJ5ZsooPBxyYNjDsUEZFmEWkiMLMZZrbazArM7MYU5dea2QozW2pmz5vZ8CjjaSp354klmzj5yGz6ZXWJOxwRkWYRWSIwswzgTuBcYAIw28wmJE22GMhz90nAX4HbooqnOSwu3MGGbeXMOlb/HRCR9iPKGsEUoMDd17h7BfAwMCtxAnd/0d3Lw8EFwJAI42myJxZvpFPHDpxz1IC4QxERaTZRJoLBQGHCcFE4ri5XAE+nKjCzq8ws38zyS0tLmzHEhquqruHJpZuZPr4fWV0yY4lBRCQKUSaCVP+0SnnjZjO7GMgDbk9V7u73uHueu+fl5OQ0Y4gN90pBGVv3VnDeMeotJCLtS8cIl10EDE0YHgJsSp7IzKYD3wFOd/cDEcMN3ukAAAlSSURBVMbTJHOWbKJHl46cOS6eRCQiEpUoawQLgdFmNsLMOgEXAXMSJzCz44C7gfPcvSTCWJpkZ3klTy3bzEcnDaRzx4y4wxERaVaRJQJ3rwKuAeYCK4FH3X25md1sZueFk90OHAH8xcyWmNmcOhYXq7++UcT+yhouPrFV924VEWmUKJuGcPengKeSxt2U8H56lOtvDu7Onxas57hhvZg4SHcaFZH2R/8sPoT5725lTdlePqfagIi0U0oEh/Dga+vp3S2TmUfrlhIi0j4pEdSjeOd+nlu5hQtOGEqXTF0kFpH2SYmgHve/to4adz47Rc1CItJ+KRHU4cXVJdz973eZdcwghvXtFnc4IiKRUSJIoaBkD199aDFjB2Txv588Ou5wREQipUSQZGd5JVc+kE+njh347SWT6dYp0h62IiKx01EuQVV1DV9+6A2Ktpfz5ytPZEhvNQmJSPunRJDgh/9cySsFZdz2qUnk5faJOxwRkRahpqHQn/+zgfvmr+Pzp4zgghOGHnoGEZF2QokAeHvLbv7n8WWcNiaHb88cF3c4IiItSokAWFW8m6oa59szx9ExQx+JiKQXHfUSdOygj0NE0o+OfCIiaU6JQEQkzSkRiIikOSUCEZE0l7Z/KNuyaz/n3zWfPfurqKiqAcAs5qBERGKQtonghVUlFG7bxwV5Q+iSmUHvbp3I7ds97rBERFpc2iaCVwrKGJDVhZ98ahKmqoCIpLG0vEZQU+PMLyjjlFHZSgIikvbSMhGs2LyL7eWVnDq6b9yhiIjELi0TwasFZQCccmR2zJGIiMQvLRPBKwVljOl/BP2yusQdiohI7NIuEeyvrGbhum2cMkq1ARER
SMNE8MaG7eyvrOFUJQIRESANE8GrBWVkdDCmjtSFYhERSMNE8ErBVo4d2osjOqftXyhERD4grRLBzn2VvFW0Q9cHREQSpFUiWLBmKzWOrg+IiCRIq0TwakEZ3TplcOzQXnGHIiLSakSaCMxshpmtNrMCM7sxRXlnM3skLH/dzHKjjOeVgjKmjuhDp45plf9EROoV2RHRzDKAO4FzgQnAbDObkDTZFcB2dx8F/Az4SVTxbNqxjzWle3V9QEQkSZSnxlOAAndf4+4VwMPArKRpZgH3h+//CkyziO4CV3tbiVNHKxGIiCSKMhEMBgoThovCcSmncfcqYCdwUAd/M7vKzPLNLL+0tLRRwfTsmsnZE/oztn+PRs0vItJeRdmZPtWZvTdiGtz9HuAegLy8vIPKG+LsiQM4e+KAxswqItKuRVkjKAKGJgwPATbVNY2ZdQR6AtsijElERJJEmQgWAqPNbISZdQIuAuYkTTMHuDR8fz7wgrs36oxfREQaJ7KmIXevMrNrgLlABnCvuy83s5uBfHefA/weeNDMCghqAhdFFY+IiKQW6Q133P0p4KmkcTclvN8PfDrKGEREpH76Z5WISJpTIhARSXNKBCIiaU6JQEQkzVlb661pZqXA+kbOng2UNWM4bYG2OT1om9NDU7Z5uLvnpCpoc4mgKcws393z4o6jJWmb04O2OT1Etc1qGhIRSXNKBCIiaS7dEsE9cQcQA21zetA2p4dItjmtrhGIiMjB0q1GICIiSZQIRETSXLtMBGY2w8xWm1mBmd2YoryzmT0Slr9uZrktH2XzasA2X2tmK8xsqZk9b2bD44izOR1qmxOmO9/M3MzafFfDhmyzmV0QftfLzeyhlo6xuTVg3x5mZi+a2eJw/54ZR5zNxczuNbMSM1tWR7mZ2S/Dz2OpmR3f5JW6e7t6Edzy+l1gJNAJeBOYkDTN1cBd4fuLgEfijrsFtvlMoFv4/r/SYZvD6XoALwMLgLy4426B73k0sBjoHQ73izvuFtjme4D/Ct9PANbFHXcTt/k04HhgWR3lM4GnCZ7weCLwelPX2R5rBFOAAndf4+4VwMPArKRpZgH3h+//Ckwzs1SPzWwrDrnN7v6iu5eHgwsInhjXljXkewa4BbgN2N+SwUWkIdt8JXCnu28HcPeSFo6xuTVkmx3ICt/35OAnIbYp7v4y9T+pcRbwgAcWAL3MbGBT1tkeE8FgoDBhuCgcl3Iad68CdgJ9WyS6aDRkmxNdQXBG0ZYdcpvN7DhgqLs/2ZKBRagh3/MYYIyZvWpmC8xsRotFF42GbPP3gYvNrIjg+SdfaZnQYnO4v/dDivTBNDFJdWaf3Ee2IdO0JQ3eHjO7GMgDTo80oujVu81m1gH4GXBZSwXUAhryPXckaB46g6DWN8/MjnL3HRHHFpWGbPNs4D53/39mdhLBUw+Pcvea6MOLRbMfv9pjjaAIGJowPISDq4rvTWNmHQmqk/VVxVq7hmwzZjYd+A5wnrsfaKHYonKobe4BHAW8ZGbrCNpS57TxC8YN3befcPdKd18LrCZIDG1VQ7b5CuBRAHd/DehCcHO29qpBv/fD0R4TwUJgtJmNMLNOBBeD5yRNMwe4NHx/PvCCh1dh2qhDbnPYTHI3QRJo6+3GcIhtdved7p7t7rnunktwXeQ8d8+PJ9xm0ZB9+3GCjgGYWTZBU9GaFo2yeTVkmzcA0wDMbDxBIiht0Shb1hzgkrD30InATnff3JQFtrumIXevMrNrgLkEPQ7udfflZnYzkO/uc4DfE1QfCwhqAhfFF3HTNXCbbweOAP4SXhff4O7nxRZ0EzVwm9uVBm7zXOBsM1sBVAPXu/vW+KJumgZu83XAb83sGwRNJJe15RM7M/szQdNednjd43tAJoC730VwHWQmUACUA5c3eZ1t+PMSEZFm0B6bhkRE5DAoEYiIpDklAhGRNKdEICKS5pQIRETSnBKBSAOZWbWZLUl45ZrZGWa2M7zz5Uoz+144beL4
VWZ2R9zxi9Sl3f2PQCRC+9z92MQR4S3M57n7R82sO7DEzGrvbVQ7viuw2Mz+7u6vtmzIIoemGoFIM3H3vcAi4Mik8fuAJTTxxmAiUVEiEGm4rgnNQn9PLjSzvgT3NFqeNL43wf1+Xm6ZMEUOj5qGRBruoKah0IfMbDFQA9wa3gLhjHD8UmBsOL64BWMVaTAlApGmm+fuH61rvJmNAV4JrxEsaengRA5FTUMiEXP3t4EfAzfEHYtIKkoEIi3jLuA0MxsRdyAiyXT3URGRNKcagYhImlMiEBFJc0oEIiJpTolARCTNKRGIiKQ5JQIRkTSnRCAikub+P8Un63vttH9vAAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Area under the ROC curve described by the `fpr` / `tpr` arrays from an earlier cell.\n",
    "auc = metrics.auc(fpr, tpr)\n",
    "\n",
    "# Explicit figure/axes instead of the implicit pyplot state machine.\n",
    "fig, ax = plt.subplots()\n",
    "ax.plot(fpr, tpr)\n",
    "# Round the score and separate it from the label so the title stays readable.\n",
    "ax.set_title(\"ROC curve, auc score: {:.3f}\".format(auc))\n",
    "ax.set_xlabel(\"FPR\")\n",
    "# Trailing semicolon keeps the returned Text object out of the cell output.\n",
    "ax.set_ylabel(\"TPR\");"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 124,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0.14401518, 0.1239812 , 0.06727951, 0.05809262, 0.01162033,\n",
       "       0.01539952, 0.        , 0.        , 0.02555282, 0.22336633,\n",
       "       0.00851682, 0.03585933, 0.02596523, 0.0528968 , 0.01313525,\n",
       "       0.01886405, 0.        , 0.        , 0.        , 0.        ,\n",
       "       0.        , 0.        , 0.        , 0.        , 0.04727327,\n",
       "       0.        , 0.00262122, 0.00959882, 0.        , 0.        ,\n",
       "       0.        , 0.        , 0.        , 0.        , 0.03595749,\n",
       "       0.01132407, 0.02695599, 0.        , 0.        , 0.0253872 ,\n",
       "       0.        , 0.01633697, 0.        , 0.        , 0.        ,\n",
       "       0.        , 0.        , 0.        ])"
      ]
     },
     "execution_count": 124,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Raw importance scores of the fitted `est`, one per input column (unlabeled here).\n",
    "est.feature_importances_"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**How feature importance is aggregated (sketch)**\n",
    "\n",
    "Every split made on a feature contributes an information gain (IG); the importance of a feature is the sum of the gains of all the splits that use it.\n",
    "\n",
    "```text\n",
    "feature 1 - purpose\n",
    "purpose = v1 -> IG1\n",
    "purpose = v2 -> IG2\n",
    "..\n",
    "\n",
    "feature 2 - age\n",
    "age < 30   -> IG3\n",
    "age < 40   -> IG4\n",
    "\n",
    "..\n",
    "\n",
    "feature1 (purpose) -> sum(IG1, IG2, ...)\n",
    "feature2 (age)     -> sum(IG3, IG4, ...)\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 125,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1.0"
      ]
     },
     "execution_count": 125,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Sanity check: the importance scores are normalized, so their total should be 1.\n",
    "est.feature_importances_.sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 126,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['months_loan_duration', 'amount', 'installment_rate',\n",
       "       'residence_history', 'age', 'existing_credits', 'default', 'dependents',\n",
       "       'checking_balance_< 0 DM', 'checking_balance_> 200 DM',\n",
       "       'checking_balance_unknown', 'credit_history_delayed',\n",
       "       'credit_history_fully repaid', 'credit_history_fully repaid this bank',\n",
       "       'credit_history_repaid', 'purpose_car (new)', 'purpose_car (used)',\n",
       "       'purpose_domestic appliances', 'purpose_education', 'purpose_furniture',\n",
       "       'purpose_others', 'purpose_radio/tv', 'purpose_repairs',\n",
       "       'purpose_retraining', 'savings_balance_501 - 1000 DM',\n",
       "       'savings_balance_< 100 DM', 'savings_balance_> 1000 DM',\n",
       "       'savings_balance_unknown', 'employment_length_1 - 4 yrs',\n",
       "       'employment_length_4 - 7 yrs', 'employment_length_> 7 yrs',\n",
       "       'employment_length_unemployed', 'personal_status_female',\n",
       "       'personal_status_married male', 'personal_status_single male',\n",
       "       'other_debtors_guarantor', 'other_debtors_none', 'property_other',\n",
       "       'property_real estate', 'property_unknown/none',\n",
       "       'installment_plan_none', 'installment_plan_stores', 'housing_own',\n",
       "       'housing_rent', 'telephone_yes', 'foreign_worker_yes',\n",
       "       'job_skilled employee', 'job_unemployed non-resident',\n",
       "       'job_unskilled resident'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 126,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Columns of the dummy-encoded frame; note that it still includes 'default'.\n",
    "df_dummy.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 128,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['months_loan_duration', 'amount', 'installment_rate',\n",
       "       'residence_history', 'age', 'existing_credits', 'dependents',\n",
       "       'checking_balance_< 0 DM', 'checking_balance_> 200 DM',\n",
       "       'checking_balance_unknown', 'credit_history_delayed',\n",
       "       'credit_history_fully repaid', 'credit_history_fully repaid this bank',\n",
       "       'credit_history_repaid', 'purpose_car (new)', 'purpose_car (used)',\n",
       "       'purpose_domestic appliances', 'purpose_education', 'purpose_furniture',\n",
       "       'purpose_others', 'purpose_radio/tv', 'purpose_repairs',\n",
       "       'purpose_retraining', 'savings_balance_501 - 1000 DM',\n",
       "       'savings_balance_< 100 DM', 'savings_balance_> 1000 DM',\n",
       "       'savings_balance_unknown', 'employment_length_1 - 4 yrs',\n",
       "       'employment_length_4 - 7 yrs', 'employment_length_> 7 yrs',\n",
       "       'employment_length_unemployed', 'personal_status_female',\n",
       "       'personal_status_married male', 'personal_status_single male',\n",
       "       'other_debtors_guarantor', 'other_debtors_none', 'property_other',\n",
       "       'property_real estate', 'property_unknown/none',\n",
       "       'installment_plan_none', 'installment_plan_stores', 'housing_own',\n",
       "       'housing_rent', 'telephone_yes', 'foreign_worker_yes',\n",
       "       'job_skilled employee', 'job_unemployed non-resident',\n",
       "       'job_unskilled resident'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 128,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Columns of the feature matrix X: the df_dummy columns without 'default'.\n",
    "X.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 129,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Label each importance score of `est` with the matching column name from X.\n",
    "importance = pd.DataFrame(\n",
    "    data=dict(feature=X.columns, importance=est.feature_importances_)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 130,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>feature</th>\n",
       "      <th>importance</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>months_loan_duration</td>\n",
       "      <td>0.144015</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>amount</td>\n",
       "      <td>0.123981</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>installment_rate</td>\n",
       "      <td>0.067280</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>residence_history</td>\n",
       "      <td>0.058093</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>age</td>\n",
       "      <td>0.011620</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>existing_credits</td>\n",
       "      <td>0.015400</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>dependents</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>checking_balance_&lt; 0 DM</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>checking_balance_&gt; 200 DM</td>\n",
       "      <td>0.025553</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>checking_balance_unknown</td>\n",
       "      <td>0.223366</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>credit_history_delayed</td>\n",
       "      <td>0.008517</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>credit_history_fully repaid</td>\n",
       "      <td>0.035859</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>credit_history_fully repaid this bank</td>\n",
       "      <td>0.025965</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>credit_history_repaid</td>\n",
       "      <td>0.052897</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>purpose_car (new)</td>\n",
       "      <td>0.013135</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>purpose_car (used)</td>\n",
       "      <td>0.018864</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>purpose_domestic appliances</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>purpose_education</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>purpose_furniture</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>purpose_others</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>purpose_radio/tv</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>purpose_repairs</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>purpose_retraining</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>savings_balance_501 - 1000 DM</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>savings_balance_&lt; 100 DM</td>\n",
       "      <td>0.047273</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>savings_balance_&gt; 1000 DM</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>savings_balance_unknown</td>\n",
       "      <td>0.002621</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>employment_length_1 - 4 yrs</td>\n",
       "      <td>0.009599</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>employment_length_4 - 7 yrs</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>employment_length_&gt; 7 yrs</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>30</th>\n",
       "      <td>employment_length_unemployed</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>31</th>\n",
       "      <td>personal_status_female</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>32</th>\n",
       "      <td>personal_status_married male</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>33</th>\n",
       "      <td>personal_status_single male</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>34</th>\n",
       "      <td>other_debtors_guarantor</td>\n",
       "      <td>0.035957</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>35</th>\n",
       "      <td>other_debtors_none</td>\n",
       "      <td>0.011324</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>36</th>\n",
       "      <td>property_other</td>\n",
       "      <td>0.026956</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>37</th>\n",
       "      <td>property_real estate</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>38</th>\n",
       "      <td>property_unknown/none</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>39</th>\n",
       "      <td>installment_plan_none</td>\n",
       "      <td>0.025387</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>40</th>\n",
       "      <td>installment_plan_stores</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>41</th>\n",
       "      <td>housing_own</td>\n",
       "      <td>0.016337</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42</th>\n",
       "      <td>housing_rent</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>43</th>\n",
       "      <td>telephone_yes</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>44</th>\n",
       "      <td>foreign_worker_yes</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>45</th>\n",
       "      <td>job_skilled employee</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>46</th>\n",
       "      <td>job_unemployed non-resident</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>47</th>\n",
       "      <td>job_unskilled resident</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                  feature  importance\n",
       "0                    months_loan_duration    0.144015\n",
       "1                                  amount    0.123981\n",
       "2                        installment_rate    0.067280\n",
       "3                       residence_history    0.058093\n",
       "4                                     age    0.011620\n",
       "5                        existing_credits    0.015400\n",
       "6                              dependents    0.000000\n",
       "7                 checking_balance_< 0 DM    0.000000\n",
       "8               checking_balance_> 200 DM    0.025553\n",
       "9                checking_balance_unknown    0.223366\n",
       "10                 credit_history_delayed    0.008517\n",
       "11            credit_history_fully repaid    0.035859\n",
       "12  credit_history_fully repaid this bank    0.025965\n",
       "13                  credit_history_repaid    0.052897\n",
       "14                      purpose_car (new)    0.013135\n",
       "15                     purpose_car (used)    0.018864\n",
       "16            purpose_domestic appliances    0.000000\n",
       "17                      purpose_education    0.000000\n",
       "18                      purpose_furniture    0.000000\n",
       "19                         purpose_others    0.000000\n",
       "20                       purpose_radio/tv    0.000000\n",
       "21                        purpose_repairs    0.000000\n",
       "22                     purpose_retraining    0.000000\n",
       "23          savings_balance_501 - 1000 DM    0.000000\n",
       "24               savings_balance_< 100 DM    0.047273\n",
       "25              savings_balance_> 1000 DM    0.000000\n",
       "26                savings_balance_unknown    0.002621\n",
       "27            employment_length_1 - 4 yrs    0.009599\n",
       "28            employment_length_4 - 7 yrs    0.000000\n",
       "29              employment_length_> 7 yrs    0.000000\n",
       "30           employment_length_unemployed    0.000000\n",
       "31                 personal_status_female    0.000000\n",
       "32           personal_status_married male    0.000000\n",
       "33            personal_status_single male    0.000000\n",
       "34                other_debtors_guarantor    0.035957\n",
       "35                     other_debtors_none    0.011324\n",
       "36                         property_other    0.026956\n",
       "37                   property_real estate    0.000000\n",
       "38                  property_unknown/none    0.000000\n",
       "39                  installment_plan_none    0.025387\n",
       "40                installment_plan_stores    0.000000\n",
       "41                            housing_own    0.016337\n",
       "42                           housing_rent    0.000000\n",
       "43                          telephone_yes    0.000000\n",
       "44                     foreign_worker_yes    0.000000\n",
       "45                   job_skilled employee    0.000000\n",
       "46            job_unemployed non-resident    0.000000\n",
       "47                 job_unskilled resident    0.000000"
      ]
     },
     "execution_count": 130,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Full table, one row per column of X, in column order (not ranked by importance).\n",
    "importance"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 132,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>feature</th>\n",
       "      <th>importance</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>months_loan_duration</td>\n",
       "      <td>0.144015</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>amount</td>\n",
       "      <td>0.123981</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>installment_rate</td>\n",
       "      <td>0.067280</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>residence_history</td>\n",
       "      <td>0.058093</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>age</td>\n",
       "      <td>0.011620</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>existing_credits</td>\n",
       "      <td>0.015400</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>dependents</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>checking_balance_&lt; 0 DM</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>checking_balance_&gt; 200 DM</td>\n",
       "      <td>0.025553</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>checking_balance_unknown</td>\n",
       "      <td>0.223366</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                     feature  importance\n",
       "0       months_loan_duration    0.144015\n",
       "1                     amount    0.123981\n",
       "2           installment_rate    0.067280\n",
       "3          residence_history    0.058093\n",
       "4                        age    0.011620\n",
       "5           existing_credits    0.015400\n",
       "6                 dependents    0.000000\n",
       "7    checking_balance_< 0 DM    0.000000\n",
       "8  checking_balance_> 200 DM    0.025553\n",
       "9   checking_balance_unknown    0.223366"
      ]
     },
     "execution_count": 132,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Rank by importance before truncating: the first ten rows follow column order,\n",
    "# so a plain slice shows zero-importance columns and hides stronger ones.\n",
    "importance.sort_values(\"importance\", ascending=False).head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 133,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn import ensemble"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 141,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "training accuracy:  0.8271428571428572\n",
      "test accuracy:  0.7433333333333333\n",
      "training precision:  0.8067226890756303\n",
      "test precision:  0.7509157509157509\n",
      "training recall:  0.9876543209876543\n",
      "test recall:  0.9579439252336449\n"
     ]
    }
   ],
   "source": [
    "# Fixed seed: the forest's bootstrap samples and per-split feature subsets are\n",
    "# random, so without it every run reports different scores and importances.\n",
    "forest = ensemble.RandomForestClassifier(max_depth=6, n_estimators=50, random_state=42)\n",
    "forest.fit(X_train, y_train)\n",
    "\n",
    "y_train_pred = forest.predict(X_train)\n",
    "y_test_pred = forest.predict(X_test)\n",
    "\n",
    "# Same metric on both splits, training first, so the train/test gap is easy to read.\n",
    "for label, score in [\n",
    "    (\"accuracy\", metrics.accuracy_score),\n",
    "    (\"precision\", metrics.precision_score),\n",
    "    (\"recall\", metrics.recall_score),\n",
    "]:\n",
    "    print(\"training \" + label + \": \", score(y_train, y_train_pred))\n",
    "    print(\"test \" + label + \": \", score(y_test, y_test_pred))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 142,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0.1012958 , 0.09682539, 0.03229096, 0.01784331, 0.07696492,\n",
       "       0.01794807, 0.00650361, 0.07494516, 0.00786617, 0.13197544,\n",
       "       0.01225512, 0.03187523, 0.0103532 , 0.01512056, 0.01852747,\n",
       "       0.01660301, 0.00219783, 0.00801843, 0.00745262, 0.00094362,\n",
       "       0.01404989, 0.00919839, 0.00015451, 0.00255661, 0.02811401,\n",
       "       0.00788001, 0.02144612, 0.00832382, 0.00725275, 0.01458451,\n",
       "       0.01335561, 0.01276311, 0.00606711, 0.0126112 , 0.0089155 ,\n",
       "       0.00606123, 0.00881428, 0.01479555, 0.01289806, 0.0267686 ,\n",
       "       0.00824498, 0.01707735, 0.01445719, 0.01346965, 0.0030963 ,\n",
       "       0.00893963, 0.00194125, 0.00835686])"
      ]
     },
     "execution_count": 142,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Raw importance scores of the fitted random forest, one per training column.\n",
    "forest.feature_importances_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 145,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>feature</th>\n",
       "      <th>importance</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>months_loan_duration</td>\n",
       "      <td>0.101296</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>amount</td>\n",
       "      <td>0.096825</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>installment_rate</td>\n",
       "      <td>0.032291</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>residence_history</td>\n",
       "      <td>0.017843</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>age</td>\n",
       "      <td>0.076965</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>existing_credits</td>\n",
       "      <td>0.017948</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>dependents</td>\n",
       "      <td>0.006504</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>checking_balance_&lt; 0 DM</td>\n",
       "      <td>0.074945</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>checking_balance_&gt; 200 DM</td>\n",
       "      <td>0.007866</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>checking_balance_unknown</td>\n",
       "      <td>0.131975</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                     feature  importance\n",
       "0       months_loan_duration    0.101296\n",
       "1                     amount    0.096825\n",
       "2           installment_rate    0.032291\n",
       "3          residence_history    0.017843\n",
       "4                        age    0.076965\n",
       "5           existing_credits    0.017948\n",
       "6                 dependents    0.006504\n",
       "7    checking_balance_< 0 DM    0.074945\n",
       "8  checking_balance_> 200 DM    0.007866\n",
       "9   checking_balance_unknown    0.131975"
      ]
     },
     "execution_count": 145,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Feature/importance table for the random forest (rebinds the earlier `importance`).\n",
    "importance = pd.DataFrame({\"feature\": X.columns, \"importance\": forest.feature_importances_})\n",
    "# Show the ten highest-ranked features rather than the first ten columns.\n",
    "importance.sort_values(\"importance\", ascending=False).head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 146,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',\n",
       "                        max_depth=6, max_features='auto', max_leaf_nodes=None,\n",
       "                        min_impurity_decrease=0.0, min_impurity_split=None,\n",
       "                        min_samples_leaf=1, min_samples_split=2,\n",
       "                        min_weight_fraction_leaf=0.0, presort='deprecated',\n",
       "                        random_state=1776766514, splitter='best'),\n",
       " DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',\n",
       "                        max_depth=6, max_features='auto', max_leaf_nodes=None,\n",
       "                        min_impurity_decrease=0.0, min_impurity_split=None,\n",
       "                        min_samples_leaf=1, min_samples_split=2,\n",
       "                        min_weight_fraction_leaf=0.0, presort='deprecated',\n",
       "                        random_state=1087178500, splitter='best'),\n",
       " DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',\n",
       "                        max_depth=6, max_features='auto', max_leaf_nodes=None,\n",
       "                        min_impurity_decrease=0.0, min_impurity_split=None,\n",
       "                        min_samples_leaf=1, min_samples_split=2,\n",
       "                        min_weight_fraction_leaf=0.0, presort='deprecated',\n",
       "                        random_state=664121568, splitter='best'),\n",
       " DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',\n",
       "                        max_depth=6, max_features='auto', max_leaf_nodes=None,\n",
       "                        min_impurity_decrease=0.0, min_impurity_split=None,\n",
       "                        min_samples_leaf=1, min_samples_split=2,\n",
       "                        min_weight_fraction_leaf=0.0, presort='deprecated',\n",
       "                        random_state=183752352, splitter='best'),\n",
       " DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',\n",
       "                        max_depth=6, max_features='auto', max_leaf_nodes=None,\n",
       "                        min_impurity_decrease=0.0, min_impurity_split=None,\n",
       "                        min_samples_leaf=1, min_samples_split=2,\n",
       "                        min_weight_fraction_leaf=0.0, presort='deprecated',\n",
       "                        random_state=1702127644, splitter='best'),\n",
       " DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',\n",
       "                        max_depth=6, max_features='auto', max_leaf_nodes=None,\n",
       "                        min_impurity_decrease=0.0, min_impurity_split=None,\n",
       "                        min_samples_leaf=1, min_samples_split=2,\n",
       "                        min_weight_fraction_leaf=0.0, presort='deprecated',\n",
       "                        random_state=756443523, splitter='best'),\n",
       " DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',\n",
       "                        max_depth=6, max_features='auto', max_leaf_nodes=None,\n",
       "                        min_impurity_decrease=0.0, min_impurity_split=None,\n",
       "                        min_samples_leaf=1, min_samples_split=2,\n",
       "                        min_weight_fraction_leaf=0.0, presort='deprecated',\n",
       "                        random_state=697393087, splitter='best'),\n",
       " DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',\n",
       "                        max_depth=6, max_features='auto', max_leaf_nodes=None,\n",
       "                        min_impurity_decrease=0.0, min_impurity_split=None,\n",
       "                        min_samples_leaf=1, min_samples_split=2,\n",
       "                        min_weight_fraction_leaf=0.0, presort='deprecated',\n",
       "                        random_state=685524680, splitter='best'),\n",
       " DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',\n",
       "                        max_depth=6, max_features='auto', max_leaf_nodes=None,\n",
       "                        min_impurity_decrease=0.0, min_impurity_split=None,\n",
       "                        min_samples_leaf=1, min_samples_split=2,\n",
       "                        min_weight_fraction_leaf=0.0, presort='deprecated',\n",
       "                        random_state=1383422173, splitter='best'),\n",
       " DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',\n",
       "                        max_depth=6, max_features='auto', max_leaf_nodes=None,\n",
       "                        min_impurity_decrease=0.0, min_impurity_split=None,\n",
       "                        min_samples_leaf=1, min_samples_split=2,\n",
       "                        min_weight_fraction_leaf=0.0, presort='deprecated',\n",
       "                        random_state=956251155, splitter='best'),\n",
       " DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',\n",
       "                        max_depth=6, max_features='auto', max_leaf_nodes=None,\n",
       "                        min_impurity_decrease=0.0, min_impurity_split=None,\n",
       "                        min_samples_leaf=1, min_samples_split=2,\n",
       "                        min_weight_fraction_leaf=0.0, presort='deprecated',\n",
       "                        random_state=721416188, splitter='best'),\n",
       " DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',\n",
       "                        max_depth=6, max_features='auto', max_leaf_nodes=None,\n",
       "                        min_impurity_decrease=0.0, min_impurity_split=None,\n",
       "                        min_samples_leaf=1, min_samples_split=2,\n",
       "                        min_weight_fraction_leaf=0.0, presort='deprecated',\n",
       "                        random_state=1288296911, splitter='best'),\n",
       " DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',\n",
       "                        max_depth=6, max_features='auto', max_leaf_nodes=None,\n",
       "                        min_impurity_decrease=0.0, min_impurity_split=None,\n",
       "                        min_samples_leaf=1, min_samples_split=2,\n",
       "                        min_weight_fraction_leaf=0.0, presort='deprecated',\n",
       "                        random_state=961329987, splitter='best'),\n",
       " DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',\n",
       "                        max_depth=6, max_features='auto', max_leaf_nodes=None,\n",
       "                        min_impurity_decrease=0.0, min_impurity_split=None,\n",
       "                        min_samples_leaf=1, min_samples_split=2,\n",
       "                        min_weight_fraction_leaf=0.0, presort='deprecated',\n",
       "                        random_state=519422218, splitter='best'),\n",
       " DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',\n",
       "                        max_depth=6, max_features='auto', max_leaf_nodes=None,\n",
       "                        min_impurity_decrease=0.0, min_impurity_split=None,\n",
       "                        min_samples_leaf=1, min_samples_split=2,\n",
       "                        min_weight_fraction_leaf=0.0, presort='deprecated',\n",
       "                        random_state=821855837, splitter='best'),\n",
       " DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',\n",
       "                        max_depth=6, max_features='auto', max_leaf_nodes=None,\n",
       "                        min_impurity_decrease=0.0, min_impurity_split=None,\n",
       "                        min_samples_leaf=1, min_samples_split=2,\n",
       "                        min_weight_fraction_leaf=0.0, presort='deprecated',\n",
       "                        random_state=1913304890, splitter='best'),\n",
       " DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',\n",
       "                        max_depth=6, max_features='auto', max_leaf_nodes=None,\n",
       "                        min_impurity_decrease=0.0, min_impurity_split=None,\n",
       "                        min_samples_leaf=1, min_samples_split=2,\n",
       "                        min_weight_fraction_leaf=0.0, presort='deprecated',\n",
       "                        random_state=2119238008, splitter='best'),\n",
       " DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',\n",
       "                        max_depth=6, max_features='auto', max_leaf_nodes=None,\n",
       "                        min_impurity_decrease=0.0, min_impurity_split=None,\n",
       "                        min_samples_leaf=1, min_samples_split=2,\n",
       "                        min_weight_fraction_leaf=0.0, presort='deprecated',\n",
       "                        random_state=383506554, splitter='best'),\n",
       " DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',\n",
       "                        max_depth=6, max_features='auto', max_leaf_nodes=None,\n",
       "                        min_impurity_decrease=0.0, min_impurity_split=None,\n",
       "                        min_samples_leaf=1, min_samples_split=2,\n",
       "                        min_weight_fraction_leaf=0.0, presort='deprecated',\n",
       "                        random_state=827006483, splitter='best'),\n",
       " DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',\n",
       "                        max_depth=6, max_features='auto', max_leaf_nodes=None,\n",
       "                        min_impurity_decrease=0.0, min_impurity_split=None,\n",
       "                        min_samples_leaf=1, min_samples_split=2,\n",
       "                        min_weight_fraction_leaf=0.0, presort='deprecated',\n",
       "                        random_state=2129796894, splitter='best'),\n",
       " DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',\n",
       "                        max_depth=6, max_features='auto', max_leaf_nodes=None,\n",
       "                        min_impurity_decrease=0.0, min_impurity_split=None,\n",
       "                        min_samples_leaf=1, min_samples_split=2,\n",
       "                        min_weight_fraction_leaf=0.0, presort='deprecated',\n",
       "                        random_state=1678675055, splitter='best'),\n",
       " DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',\n",
       "                        max_depth=6, max_features='auto', max_leaf_nodes=None,\n",
       "                        min_impurity_decrease=0.0, min_impurity_split=None,\n",
       "                        min_samples_leaf=1, min_samples_split=2,\n",
       "                        min_weight_fraction_leaf=0.0, presort='deprecated',\n",
       "                        random_state=1822692151, splitter='best'),\n",
       " DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',\n",
       "                        max_depth=6, max_features='auto', max_leaf_nodes=None,\n",
       "                        min_impurity_decrease=0.0, min_impurity_split=None,\n",
       "                        min_samples_leaf=1, min_samples_split=2,\n",
       "                        min_weight_fraction_leaf=0.0, presort='deprecated',\n",
       "                        random_state=801678338, splitter='best'),\n",
       " DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',\n",
       "                        max_depth=6, max_features='auto', max_leaf_nodes=None,\n",
       "                        min_impurity_decrease=0.0, min_impurity_split=None,\n",
       "                        min_samples_leaf=1, min_samples_split=2,\n",
       "                        min_weight_fraction_leaf=0.0, presort='deprecated',\n",
       "                        random_state=220500646, splitter='best'),\n",
       " DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',\n",
       "                        max_depth=6, max_features='auto', max_leaf_nodes=None,\n",
       "                        min_impurity_decrease=0.0, min_impurity_split=None,\n",
       "                        min_samples_leaf=1, min_samples_split=2,\n",
       "                        min_weight_fraction_leaf=0.0, presort='deprecated',\n",
       "                        random_state=69059735, splitter='best'),\n",
       " DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',\n",
       "                        max_depth=6, max_features='auto', max_leaf_nodes=None,\n",
       "                        min_impurity_decrease=0.0, min_impurity_split=None,\n",
       "                        min_samples_leaf=1, min_samples_split=2,\n",
       "                        min_weight_fraction_leaf=0.0, presort='deprecated',\n",
       "                        random_state=737010616, splitter='best'),\n",
       " DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',\n",
       "                        max_depth=6, max_features='auto', max_leaf_nodes=None,\n",
       "                        min_impurity_decrease=0.0, min_impurity_split=None,\n",
       "                        min_samples_leaf=1, min_samples_split=2,\n",
       "                        min_weight_fraction_leaf=0.0, presort='deprecated',\n",
       "                        random_state=1875743282, splitter='best'),\n",
       " DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',\n",
       "                        max_depth=6, max_features='auto', max_leaf_nodes=None,\n",
       "                        min_impurity_decrease=0.0, min_impurity_split=None,\n",
       "                        min_samples_leaf=1, min_samples_split=2,\n",
       "                        min_weight_fraction_leaf=0.0, presort='deprecated',\n",
       "                        random_state=1127598631, splitter='best'),\n",
       " DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',\n",
       "                        max_depth=6, max_features='auto', max_leaf_nodes=None,\n",
       "                        min_impurity_decrease=0.0, min_impurity_split=None,\n",
       "                        min_samples_leaf=1, min_samples_split=2,\n",
       "                        min_weight_fraction_leaf=0.0, presort='deprecated',\n",
       "                        random_state=1480074863, splitter='best'),\n",
       " DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',\n",
       "                        max_depth=6, max_features='auto', max_leaf_nodes=None,\n",
       "                        min_impurity_decrease=0.0, min_impurity_split=None,\n",
       "                        min_samples_leaf=1, min_samples_split=2,\n",
       "                        min_weight_fraction_leaf=0.0, presort='deprecated',\n",
       "                        random_state=1865260314, splitter='best'),\n",
       " DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',\n",
       "                        max_depth=6, max_features='auto', max_leaf_nodes=None,\n",
       "                        min_impurity_decrease=0.0, min_impurity_split=None,\n",
       "                        min_samples_leaf=1, min_samples_split=2,\n",
       "                        min_weight_fraction_leaf=0.0, presort='deprecated',\n",
       "                        random_state=1390215547, splitter='best'),\n",
       " DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',\n",
       "                        max_depth=6, max_features='auto', max_leaf_nodes=None,\n",
       "                        min_impurity_decrease=0.0, min_impurity_split=None,\n",
       "                        min_samples_leaf=1, min_samples_split=2,\n",
       "                        min_weight_fraction_leaf=0.0, presort='deprecated',\n",
       "                        random_state=1661539180, splitter='best'),\n",
       " DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',\n",
       "                        max_depth=6, max_features='auto', max_leaf_nodes=None,\n",
       "                        min_impurity_decrease=0.0, min_impurity_split=None,\n",
       "                        min_samples_leaf=1, min_samples_split=2,\n",
       "                        min_weight_fraction_leaf=0.0, presort='deprecated',\n",
       "                        random_state=1001328911, splitter='best'),\n",
       " DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',\n",
       "                        max_depth=6, max_features='auto', max_leaf_nodes=None,\n",
       "                        min_impurity_decrease=0.0, min_impurity_split=None,\n",
       "                        min_samples_leaf=1, min_samples_split=2,\n",
       "                        min_weight_fraction_leaf=0.0, presort='deprecated',\n",
       "                        random_state=1373932204, splitter='best'),\n",
       " DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',\n",
       "                        max_depth=6, max_features='auto', max_leaf_nodes=None,\n",
       "                        min_impurity_decrease=0.0, min_impurity_split=None,\n",
       "                        min_samples_leaf=1, min_samples_split=2,\n",
       "                        min_weight_fraction_leaf=0.0, presort='deprecated',\n",
       "                        random_state=1171376576, splitter='best'),\n",
       " DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',\n",
       "                        max_depth=6, max_features='auto', max_leaf_nodes=None,\n",
       "                        min_impurity_decrease=0.0, min_impurity_split=None,\n",
       "                        min_samples_leaf=1, min_samples_split=2,\n",
       "                        min_weight_fraction_leaf=0.0, presort='deprecated',\n",
       "                        random_state=1296032508, splitter='best'),\n",
       " DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',\n",
       "                        max_depth=6, max_features='auto', max_leaf_nodes=None,\n",
       "                        min_impurity_decrease=0.0, min_impurity_split=None,\n",
       "                        min_samples_leaf=1, min_samples_split=2,\n",
       "                        min_weight_fraction_leaf=0.0, presort='deprecated',\n",
       "                        random_state=194332320, splitter='best'),\n",
       " DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',\n",
       "                        max_depth=6, max_features='auto', max_leaf_nodes=None,\n",
       "                        min_impurity_decrease=0.0, min_impurity_split=None,\n",
       "                        min_samples_leaf=1, min_samples_split=2,\n",
       "                        min_weight_fraction_leaf=0.0, presort='deprecated',\n",
       "                        random_state=891901763, splitter='best'),\n",
       " DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',\n",
       "                        max_depth=6, max_features='auto', max_leaf_nodes=None,\n",
       "                        min_impurity_decrease=0.0, min_impurity_split=None,\n",
       "                        min_samples_leaf=1, min_samples_split=2,\n",
       "                        min_weight_fraction_leaf=0.0, presort='deprecated',\n",
       "                        random_state=647258859, splitter='best'),\n",
       " DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',\n",
       "                        max_depth=6, max_features='auto', max_leaf_nodes=None,\n",
       "                        min_impurity_decrease=0.0, min_impurity_split=None,\n",
       "                        min_samples_leaf=1, min_samples_split=2,\n",
       "                        min_weight_fraction_leaf=0.0, presort='deprecated',\n",
       "                        random_state=478834039, splitter='best'),\n",
       " DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',\n",
       "                        max_depth=6, max_features='auto', max_leaf_nodes=None,\n",
       "                        min_impurity_decrease=0.0, min_impurity_split=None,\n",
       "                        min_samples_leaf=1, min_samples_split=2,\n",
       "                        min_weight_fraction_leaf=0.0, presort='deprecated',\n",
       "                        random_state=563784379, splitter='best'),\n",
       " DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',\n",
       "                        max_depth=6, max_features='auto', max_leaf_nodes=None,\n",
       "                        min_impurity_decrease=0.0, min_impurity_split=None,\n",
       "                        min_samples_leaf=1, min_samples_split=2,\n",
       "                        min_weight_fraction_leaf=0.0, presort='deprecated',\n",
       "                        random_state=558345168, splitter='best'),\n",
       " DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',\n",
       "                        max_depth=6, max_features='auto', max_leaf_nodes=None,\n",
       "                        min_impurity_decrease=0.0, min_impurity_split=None,\n",
       "                        min_samples_leaf=1, min_samples_split=2,\n",
       "                        min_weight_fraction_leaf=0.0, presort='deprecated',\n",
       "                        random_state=368695702, splitter='best'),\n",
       " DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',\n",
       "                        max_depth=6, max_features='auto', max_leaf_nodes=None,\n",
       "                        min_impurity_decrease=0.0, min_impurity_split=None,\n",
       "                        min_samples_leaf=1, min_samples_split=2,\n",
       "                        min_weight_fraction_leaf=0.0, presort='deprecated',\n",
       "                        random_state=106511248, splitter='best'),\n",
       " DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',\n",
       "                        max_depth=6, max_features='auto', max_leaf_nodes=None,\n",
       "                        min_impurity_decrease=0.0, min_impurity_split=None,\n",
       "                        min_samples_leaf=1, min_samples_split=2,\n",
       "                        min_weight_fraction_leaf=0.0, presort='deprecated',\n",
       "                        random_state=474272604, splitter='best'),\n",
       " DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',\n",
       "                        max_depth=6, max_features='auto', max_leaf_nodes=None,\n",
       "                        min_impurity_decrease=0.0, min_impurity_split=None,\n",
       "                        min_samples_leaf=1, min_samples_split=2,\n",
       "                        min_weight_fraction_leaf=0.0, presort='deprecated',\n",
       "                        random_state=457562369, splitter='best'),\n",
       " DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',\n",
       "                        max_depth=6, max_features='auto', max_leaf_nodes=None,\n",
       "                        min_impurity_decrease=0.0, min_impurity_split=None,\n",
       "                        min_samples_leaf=1, min_samples_split=2,\n",
       "                        min_weight_fraction_leaf=0.0, presort='deprecated',\n",
       "                        random_state=495049404, splitter='best'),\n",
       " DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',\n",
       "                        max_depth=6, max_features='auto', max_leaf_nodes=None,\n",
       "                        min_impurity_decrease=0.0, min_impurity_split=None,\n",
       "                        min_samples_leaf=1, min_samples_split=2,\n",
       "                        min_weight_fraction_leaf=0.0, presort='deprecated',\n",
       "                        random_state=609735876, splitter='best'),\n",
       " DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',\n",
       "                        max_depth=6, max_features='auto', max_leaf_nodes=None,\n",
       "                        min_impurity_decrease=0.0, min_impurity_split=None,\n",
       "                        min_samples_leaf=1, min_samples_split=2,\n",
       "                        min_weight_fraction_leaf=0.0, presort='deprecated',\n",
       "                        random_state=129884074, splitter='best'),\n",
       " DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',\n",
       "                        max_depth=6, max_features='auto', max_leaf_nodes=None,\n",
       "                        min_impurity_decrease=0.0, min_impurity_split=None,\n",
       "                        min_samples_leaf=1, min_samples_split=2,\n",
       "                        min_weight_fraction_leaf=0.0, presort='deprecated',\n",
       "                        random_state=1933877961, splitter='best')]"
      ]
     },
     "execution_count": 146,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "forest.estimators_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 148,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.tree import export_graphviz\n",
    "\n",
    "# Write the forest's second fitted tree (index 1) to a Graphviz DOT file,\n",
    "# labelling the features with X's column names and drawing filled nodes.\n",
    "export_graphviz(\n",
    "    forest.estimators_[1],\n",
    "    out_file=\"tree.dot\",\n",
    "    feature_names=X.columns,\n",
    "    filled=True,\n",
    ")\n",
    "\n",
    "# Render the DOT file to a PNG with the Graphviz command-line tool.\n",
    "!dot -Tpng tree.dot -o tree.png"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 179,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "training accuracy:  0.6942857142857143\n",
      "test accuracy:  0.7133333333333334\n",
      "training precision:  0.6942857142857143\n",
      "test precision:  0.7133333333333334\n",
      "training recall:  1.0\n",
      "test recall:  1.0\n"
     ]
    }
   ],
   "source": [
    "# Bag 20 depth-1 decision trees with a fixed seed.\n",
    "est = tree.DecisionTreeClassifier(\n",
    "    criterion=\"gini\",\n",
    "    max_depth=1,\n",
    "    min_samples_leaf=10,\n",
    ")\n",
    "bagging = ensemble.BaggingClassifier(est, n_estimators=20, random_state=23)\n",
    "bagging.fit(X_train, y_train)\n",
    "\n",
    "y_train_pred = bagging.predict(X_train)\n",
    "y_test_pred = bagging.predict(X_test)\n",
    "\n",
    "# For each metric, print the training-split score and then the test-split score.\n",
    "for _metric_name, _score_fn in (\n",
    "    (\"accuracy\", metrics.accuracy_score),\n",
    "    (\"precision\", metrics.precision_score),\n",
    "    (\"recall\", metrics.recall_score),\n",
    "):\n",
    "    print(\"training \" + _metric_name + \": \", _score_fn(y_train, y_train_pred))\n",
    "    print(\"test \" + _metric_name + \": \", _score_fn(y_test, y_test_pred))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 180,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
       "       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
       "       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
       "       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
       "       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
       "       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
       "       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
       "       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
       "       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
       "       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
       "       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
       "       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
       "       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
       "       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
       "       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
       "       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
       "       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
       "       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
       "       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
       "       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
       "       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
       "       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
       "       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
       "       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
       "       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
       "       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
       "       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
       "       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
       "       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
       "       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
       "       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
       "       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])"
      ]
     },
     "execution_count": 180,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Summarise the predicted labels per class instead of echoing every element\n",
    "# of the prediction array.\n",
    "pd.Series(y_train_pred).value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 190,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "training accuracy:  0.9228571428571428\n",
      "test accuracy:  0.7233333333333334\n",
      "training precision:  0.9268774703557312\n",
      "test precision:  0.7963800904977375\n",
      "training recall:  0.9650205761316872\n",
      "test recall:  0.822429906542056\n"
     ]
    }
   ],
   "source": [
    "# Boost 100 depth-3 trees; each leaf must cover at least 100 training samples.\n",
    "est = tree.DecisionTreeClassifier(max_depth=3, min_samples_leaf=100, criterion=\"gini\")\n",
    "\n",
    "# Seed the booster so a rerun reproduces the same scores; 23 matches the seed\n",
    "# already used for the bagging model in this notebook.\n",
    "adaboost = ensemble.AdaBoostClassifier(est, n_estimators=100, random_state=23)\n",
    "adaboost.fit(X_train, y_train)\n",
    "\n",
    "y_train_pred = adaboost.predict(X_train)\n",
    "y_test_pred = adaboost.predict(X_test)\n",
    "\n",
    "# For each metric, print the training-split score and then the test-split score.\n",
    "for _metric_name, _score_fn in (\n",
    "    (\"accuracy\", metrics.accuracy_score),\n",
    "    (\"precision\", metrics.precision_score),\n",
    "    (\"recall\", metrics.recall_score),\n",
    "):\n",
    "    print(\"training \" + _metric_name + \": \", _score_fn(y_train, y_train_pred))\n",
    "    print(\"test \" + _metric_name + \": \", _score_fn(y_test, y_test_pred))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
